In [1]:
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
import string
import itertools
import pandas as pd

stopwords = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\melania\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\melania\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
def _trees_to_keyphrases(parsed_chunks):
    """
    Helper method to extract keyphrases as space-separated text from the trees of parsed chunks.
    """
    # Convert extracted to flat (CONLL - Begining, Inside, Outside of chunk) format
    chunks2conll = [nltk.chunk.tree2conlltags(_chunk) for _chunk in parsed_chunks]
    
    chunks2groups = [(key, list(group)) for _sent in chunks2conll 
                     for key, group in itertools.groupby(_sent, lambda x : x[2] != 'O')]
    
    # Get only the keyphrases:
    keyphrases = [" ".join(x[0] for x in group) for key, group in chunks2groups if key]
    keyphrases = [_kp for _kp in keyphrases if all(_s not in string.punctuation for _s in _kp)]
    
    return keyphrases
    
    
def select_by_pos_tag(sentences, regexp, verbose=False):
    # POS-tag sentences 
    pos_tagged_sentences = [nltk.pos_tag(_sentence) for _sentence in sentences]
    if verbose: print("1. Pos-tagged sentences: ", pos_tagged_sentences[0])
    
    # Extract chunks matching the regexp from the sentences
    chunker = nltk.chunk.regexp.RegexpParser(regexp)
    sentence_chunks = [chunker.parse(_sentence) for _sentence in pos_tagged_sentences]
    if verbose: print("2. Extracted chunks: ", sentence_chunks[0])
    
    # Extract the keyphrases from the tree format
    candidates = _trees_to_keyphrases(sentence_chunks)
    if verbose: print("3. Extracted keyphrases:", candidates[:10])
    
    return set(candidates)

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict, Counter

def score_keyphrases_by_tfidf(texts, candidates, squashing_func=sum):
    vectorizer = TfidfVectorizer(ngram_range=(1,3), vocabulary=candidates)
    train_corpus = vectorizer.fit_transform(texts)
    
    kp_tfidfs = defaultdict(lambda: [])
    for kp in candidates:
        kp_tfidfs[kp] += [train_corpus[0].toarray()[0][vectorizer.vocabulary_[kp]]]
    
    squashed_results = {_kp: squashing_func(_v) for _kp, _v in kp_tfidfs.items()}
    return squashed_results

In [6]:
#1. TF_IDF for Indeed.com
from nltk.tokenize import TweetTokenizer
tweet_tok = TweetTokenizer()
df = pd.read_csv("indeed - indeed.csv", header = None, names = ["Title", "Text"])
full_text = "\n\n\n\n".join(df['Text'])
sentences = nltk.sent_tokenize(full_text)
token_sentences = [tweet_tok.tokenize(sentence) for sentence in sentences]
print(token_sentences[0:4])

[['Fullstack', 'Java', 'Developer', '(', 'm', '/', 'f', ')', 'gamigo', 'AG-Redwood', 'City', ',', 'CAAbout', 'us', 'The', 'gamigo', 'group', 'is', 'a', 'thriving', 'and', 'fast-growing', 'company', 'within', 'the', 'games', 'market', '.'], ['We', 'have', 'an', 'ample', 'portfolio', 'of', 'online', 'and', 'mobile', 'games', 'at', 'command', ',', 'including', 'a', 'variety', 'of', 'leading', 'MMO', 'role', 'playing', 'and', 'strategy', 'games', '.'], ['Our', 'company', 'focuses', 'primarily', 'on', 'games', 'operations', 'on', 'the', 'European', 'and', 'American', 'market', '.'], ['With', 'millions', 'registered', 'users', 'and', 'a', 'broad', 'range', 'of', 'gaming', 'portals', ',', 'the', 'gamigo', 'group', 'turned', 'into', 'one', 'of', 'the', 'leading', 'gaming', 'providers', '.']]


In [14]:
grammar = r'KT: {(<NN><NN>|<JJ><NN>|<FW>)}'
candidates = select_by_pos_tag(token_sentences, grammar, verbose=True)

1. Pos-tagged sentences:  [('Senior', 'JJ'), ('Java', 'NNP'), ('Developer', 'NNP'), ('–', 'NNP'), ('Syncplicity', 'NNP'), ('We', 'PRP'), ('are', 'VBP'), ('searching', 'VBG'), ('for', 'IN'), ('a', 'DT'), ('senior', 'JJ'), ('developer', 'NN'), ('in', 'IN'), ('our', 'PRP$'), ('Java', 'NNP'), ('Syncplicity', 'NNP'), ('team', 'NN'), ('.', '.')]
2. Extracted chunks:  (S
  Senior/JJ
  Java/NNP
  Developer/NNP
  –/NNP
  Syncplicity/NNP
  We/PRP
  are/VBP
  searching/VBG
  for/IN
  a/DT
  (KT senior/JJ developer/NN)
  in/IN
  our/PRP$
  Java/NNP
  Syncplicity/NNP
  team/NN
  ./.)
3. Extracted keyphrases: ['senior developer', 'great opportunity', 'competitive remuneration', 'international company', 'full potential', 'Working time', 'office space', 'various entertainment', 'table tennis', 'Regular team']


In [8]:
candidates

{'other technology',
 'item management',
 'Project leadership',
 'software system',
 'parental leave',
 'Major Market',
 'skillsStrong understanding',
 'clear code',
 'client leadership',
 'award winning',
 'career opportunity',
 'code development',
 'management information',
 'new feature',
 'similar development',
 'application technology',
 'high velocity work ethos',
 'world today',
 'similar build',
 'service level',
 'test design',
 'edge technology',
 'distributed system design Exposure',
 'handson experience',
 'maintainable Work',
 'web presentation',
 'distributed microservice',
 'agile environment',
 'detailed unit',
 'll need',
 'premier client',
 'enable client',
 'monitor software',
 'own pay',
 'Direct Hire',
 'emergency systemsOur',
 'deferred compensation',
 'bottom line',
 'technical mentor',
 'remote heart',
 'scrum agile development methodology',
 'share knowledge',
 'program access',
 'dynamic organization',
 'business consultation',
 'full service',
 'willing mento

In [10]:
#Getting TF-IDF scores 
documents = df['Text']
kp_tfidfs = score_keyphrases_by_tfidf(documents, candidates)
Counter(kp_tfidfs).most_common(35)

[('close collaboration', 0.3356977535618584),
 ('gamigo group', 0.3356977535618584),
 ('role playing', 0.1678488767809292),
 ('office kitchen', 0.1678488767809292),
 ('single step', 0.1678488767809292),
 ('strategic optimization', 0.1678488767809292),
 ('functional code', 0.1678488767809292),
 ('energetic company', 0.1678488767809292),
 ('international team', 0.1678488767809292),
 ('commercial use', 0.1678488767809292),
 ('follow lean', 0.1678488767809292),
 ('account system', 0.1678488767809292),
 ('ample portfolio', 0.1678488767809292),
 ('major technology', 0.1678488767809292),
 ('production platform', 0.1678488767809292),
 ('practical experience', 0.156587570211699),
 ('platform team', 0.156587570211699),
 ('web architecture', 0.156587570211699),
 ('dynamic work', 0.156587570211699),
 ('experienced engineer', 0.156587570211699),
 ('broad range', 0.1373362394762297),
 ('health care', 0.1373362394762297),
 ('share knowledge', 0.1330548894200199),
 ('strong background', 0.133054889420

In [5]:
scores = [('close collaboration', 0.3356977535618584),
 ('gamigo group', 0.3356977535618584),
 ('role playing', 0.1678488767809292),
 ('office kitchen', 0.1678488767809292),
 ('single step', 0.1678488767809292),
 ('strategic optimization', 0.1678488767809292),
 ('functional code', 0.1678488767809292),
 ('energetic company', 0.1678488767809292),
 ('international team', 0.1678488767809292),
 ('commercial use', 0.1678488767809292),
 ('follow lean', 0.1678488767809292),
 ('account system', 0.1678488767809292),
 ('ample portfolio', 0.1678488767809292),
 ('major technology', 0.1678488767809292),
 ('production platform', 0.1678488767809292),
 ('practical experience', 0.156587570211699),
 ('platform team', 0.156587570211699),
 ('web architecture', 0.156587570211699)]
pd.DataFrame(scores)

Unnamed: 0,0,1
0,close collaboration,0.335698
1,gamigo group,0.335698
2,role playing,0.167849
3,office kitchen,0.167849
4,single step,0.167849
5,strategic optimization,0.167849
6,functional code,0.167849
7,energetic company,0.167849
8,international team,0.167849
9,commercial use,0.167849


In [12]:
#2. JObs.bg
df_jobs = pd.read_csv("jobsbg_javaInTitle.csv", header = None, names = ["Title", "Text"])
full_text = "\n\n\n\n".join(df_jobs['Text'])
sentences = nltk.sent_tokenize(full_text)
token_sentences = [tweet_tok.tokenize(sentence) for sentence in sentences]
print(token_sentences[0:4])

[['Senior', 'Java', 'Developer', '–', 'Syncplicity', 'We', 'are', 'searching', 'for', 'a', 'senior', 'developer', 'in', 'our', 'Java', 'Syncplicity', 'team', '.'], ['This', 'team', 'is', 'responsible', 'for', 'Java', '/', 'Linux', 'based', 'cloud', 'micro-services', 'and', 'on-premises', 'connectors', 'that', 'extend', 'the', 'capabilities', 'of', 'the', 'Syncplicity', 'SaaS', 'offering', '.'], ['Working', 'in', 'the', 'Syncplicity', 'team', 'gives', 'you', 'a', 'great', 'opportunity', 'to', 'learn', 'some', 'of', 'the', 'latest', 'in', 'Java', ',', 'Linux', ',', 'AWS', ',', 'security', 'and', 'deployment', 'technologies', 'to', 'advance', 'your', 'skills', 'and', 'career', '.'], ['I', 'have', 'the', 'luxury', 'to', 'work', 'with', 'young', 'and', 'very', 'talented', 'people', '.']]


In [15]:
grammar = r'KT: {(<NN><NN>|<JJ><NN>|<FW>)}'
candidates = select_by_pos_tag(token_sentences, grammar, verbose=True)

1. Pos-tagged sentences:  [('Senior', 'JJ'), ('Java', 'NNP'), ('Developer', 'NNP'), ('–', 'NNP'), ('Syncplicity', 'NNP'), ('We', 'PRP'), ('are', 'VBP'), ('searching', 'VBG'), ('for', 'IN'), ('a', 'DT'), ('senior', 'JJ'), ('developer', 'NN'), ('in', 'IN'), ('our', 'PRP$'), ('Java', 'NNP'), ('Syncplicity', 'NNP'), ('team', 'NN'), ('.', '.')]
2. Extracted chunks:  (S
  Senior/JJ
  Java/NNP
  Developer/NNP
  –/NNP
  Syncplicity/NNP
  We/PRP
  are/VBP
  searching/VBG
  for/IN
  a/DT
  (KT senior/JJ developer/NN)
  in/IN
  our/PRP$
  Java/NNP
  Syncplicity/NNP
  team/NN
  ./.)
3. Extracted keyphrases: ['senior developer', 'great opportunity', 'competitive remuneration', 'international company', 'full potential', 'Working time', 'office space', 'various entertainment', 'table tennis', 'Regular team']


In [17]:
candidates

{'increased demand',
 'virtual replication',
 'new business',
 'Atlassian apps',
 'particular department',
 'clear code',
 'die wir',
 'career opportunity',
 'united team',
 'encouraging work',
 'subject line',
 'project domain',
 'salary –',
 'driven organization',
 'sunny office',
 'single thing',
 'regional skill',
 'assume responsibility',
 'electronic book',
 'customer portfolio',
 'following software',
 'consumer credit',
 'collegiate manner',
 'commercial system',
 'new team',
 'international distribution',
 'global product',
 'висше образование',
 'good command',
 'share knowledge',
 'Good level',
 'multinational working',
 'career enhancement',
 'reference code',
 'dynamic team',
 'due consideration',
 'professional agenda',
 'global community',
 'English communication',
 'present client',
 'related subject',
 'scrum team',
 'national origin',
 'central platform',
 'working environment',
 'graph database',
 'sneak peek',
 'international work',
 'Good interpersonal',
 'core dev

In [18]:
#Getting TF-IDF scores 
documents = df_jobs['Text']
kp_tfidfs = score_keyphrases_by_tfidf(documents, candidates)
Counter(kp_tfidfs).most_common(35)

[('senior member', 0.2812052403247175),
 ('robust design', 0.2812052403247175),
 ('various entertainment', 0.2812052403247175),
 ('crisp definition', 0.2812052403247175),
 ('great opportunity', 0.25932954541363096),
 ('technical lead', 0.25932954541363096),
 ('senior developer', 0.25932954541363096),
 ('office space', 0.24380849286805553),
 ('depth knowledge', 0.23176942920434854),
 ('table tennis', 0.23176942920434854),
 ('full potential', 0.23176942920434854),
 ('architecture design', 0.21361604473997434),
 ('working time', 0.2064117454113936),
 ('verbal', 0.1943726817476866),
 ('regular team', 0.16901499795473163),
 ('international company', 0.15434360237222586),
 ('working environment', 0.1518337497389472),
 ('problem solving', 0.14493687062731764),
 ('competitive remuneration', 0.13510023937993812),
 ('remuneration package', 0.11824696847874398),
 ('java developer', 0.08151221831038559),
 ('increased demand', 0.0),
 ('virtual replication', 0.0),
 ('new business', 0.0),
 ('Atlassia

In [4]:
scores = [('senior member', 0.2812052403247175),
 ('robust design', 0.2812052403247175),
 ('various entertainment', 0.2812052403247175),
 ('crisp definition', 0.2812052403247175),
 ('great opportunity', 0.25932954541363096),
 ('technical lead', 0.25932954541363096),
 ('senior developer', 0.25932954541363096),
 ('office space', 0.24380849286805553),
 ('depth knowledge', 0.23176942920434854),
 ('table tennis', 0.23176942920434854),
 ('full potential', 0.23176942920434854),
 ('architecture design', 0.21361604473997434),
 ('working time', 0.2064117454113936),
 ('verbal', 0.1943726817476866),
 ('regular team', 0.16901499795473163),
 ('international company', 0.15434360237222586),
 ('working environment', 0.1518337497389472),
 ('problem solving', 0.14493687062731764),
 ('competitive remuneration', 0.13510023937993812),
 ('remuneration package', 0.11824696847874398),
 ('java developer', 0.08151221831038559),
 ('increased demand', 0.0),
 ('virtual replication', 0.0),
 ('new business', 0.0),
 ('Atlassian apps', 0.0),
 ('particular department', 0.0),
 ('clear code', 0.0),
 ('die wir', 0.0),
 ('career opportunity', 0.0),
 ('united team', 0.0),
 ('encouraging work', 0.0),
 ('subject line', 0.0),
 ('project domain', 0.0),
 ('salary –', 0.0),
 ('driven organization', 0.0)]
import pandas as pd
pd.DataFrame(scores)

Unnamed: 0,0,1
0,senior member,0.281205
1,robust design,0.281205
2,various entertainment,0.281205
3,crisp definition,0.281205
4,great opportunity,0.25933
5,technical lead,0.25933
6,senior developer,0.25933
7,office space,0.243808
8,depth knowledge,0.231769
9,table tennis,0.231769


In [30]:
df_jobs["category"] = "BG"
df_jobs.head()

Unnamed: 0,Title,Text,category
0,Senior Java Developer - Syncplicity,\n\n\nSenior Java Developer – Syncplicity\nWe ...,BG
1,Java Software Engineer,Let us\n ...,BG
2,Senior Java Developer with Oracle Experience,Senior Java Developer with Oracle Experience\n...,BG
3,Senior Software Engineer with Java and Spring,\nSenior Software Engineer with Java and Sprin...,BG
4,Java Software Developer Aviation,\n\n\n\n\n\n\n\n\n\n\n\nJava Software Develope...,BG


In [39]:
df["category"] = "USA"
df_all = pd.concat([df, df_jobs])
df.tail()

Unnamed: 0,Title,Text,category
304,"Lead SW Platform Developer (Java, Spring Boot,...","Lead SW Platform Developer (Java, Spring Boot,...",USA
305,Software Engineer Java Developer (2+ yrs),Software Engineer Java Developer (2+ yrs)Techn...,USA
306,"Full Stack Lead Developer with Angular, Node.j...","Full Stack Lead Developer with Angular, Node.j...",USA
307,Java developer analytics and big data software...,Java developer analytics and big data software...,USA
308,"Sr Software Developers- Ruby, Java or Javascri...","Sr Software Developers- Ruby, Java or Javascri...",USA


In [71]:
df_all.tail()

Unnamed: 0,Title,Text,category
129,Software Engineer (Python/Java/Go),\n\n\n\n\n\n\n\n\n\n\n\n\nSoftware Engineer (P...,BG
130,Java Developer - Middle level,\n\n\n\n\n\n\n\n\n\n\n\n\nJava Developer - Mid...,BG
131,Senior Java developer,\n\n\n\n\n\n\n\n\n\n\nSenior Java developer\n\...,BG
132,JAVA DEVELOPER,\n\n\n\n\n\n\n\n\n\n\n\n\nJAVA DEVELOPER\n\n\n...,BG
133,Senior Java Engineer,\n\n\n\n\n\n\n\n\n\n\n\n\nSenior Java Engineer...,BG


## Pointwise Mutual Information 

- how likely is it to encounter a __word in a specific category__?

$ \operatorname{PMI}(word, category) = \log\frac{p(word, category)}{p(word)p(category)} $ <br>

In [54]:
from nltk.tokenize import TweetTokenizer
def all_words(file_list):
    """return list of all words in the files.""" 
    full_text = "\n\n".join(file_list)
    tweet_tok = TweetTokenizer()
    tokens = tweet_tok.tokenize(full_text)  
    tokens_lower = [token.lower() for token in tokens]
    tokens_no_punct = [token for token in tokens_lower if token not in ".!?-/\()[]{},:;`'\""]
    return tokens_no_punct

In [68]:
# Pontwise mutual information 
def pmi(category):
#     category_files = nltk.corpus.reuters.fileids(category)
#     category_files_words = [_word for _fid in category_files for _word in nltk.corpus.reuters.words(_fid)]
#     total_number_of_words_in_category = len(category_files_words)
#     total_number_of_words = len(nltk.corpus.reuters.words())

    category_files = df_all[df_all["category"]==category]["Text"]
    category_files_words = all_words(category_files)
    total_number_of_words_in_category = len(category_files_words)
    total_words = all_words(df["Text"])
    total_number_of_words = len(total_words)
    
    word_frequencies = Counter(category_files_words)
    word_frequencies_in_category = Counter(total_words)
    
    word_pmis = {}
    for word in set(category_files_words):
        if word in string.punctuation or word in stopwords or len(word)<3:
            continue
        p_wc =  word_frequencies_in_category[word] / total_number_of_words_in_category
        p_w = word_frequencies[word] / total_number_of_words
        p_c = len(df_all[df_all["category"]==category]) / len(df_all)
        word_pmis[word] = p_wc / (p_w * p_c)
    return word_pmis

In [69]:
pmi_usa = pmi('USA')
Counter(pmi_usa).most_common(20)

[('interfaces', 1.4336569579288028),
 ('markets', 1.4336569579288028),
 ('professionals', 1.4336569579288028),
 ('government', 1.4336569579288028),
 ('department', 1.4336569579288028),
 ('make', 1.4336569579288028),
 ('motivated', 1.4336569579288028),
 ('strategy', 1.4336569579288028),
 ('along', 1.4336569579288028),
 ('sources', 1.4336569579288028),
 ('leveraging', 1.4336569579288028),
 ('occasional', 1.4336569579288028),
 ('top', 1.4336569579288028),
 ('reviews-los', 1.4336569579288028),
 ('little', 1.4336569579288028),
 ('agile', 1.4336569579288028),
 ('overview', 1.4336569579288028),
 ('google', 1.4336569579288028),
 ('critical', 1.4336569579288028),
 ('fields', 1.4336569579288028)]

In [70]:
pmi_bg = pmi('BG')
Counter(pmi_bg).most_common(20)

[('ago', 1947.1563755246882),
 ('save', 1947.1563755246882),
 ('alternative', 1080.3577309362786),
 ('report', 659.5207078390073),
 ('francisco', 439.6804718926715),
 ('screening', 360.1192436454262),
 ('disability', 339.1820783172037),
 ('veteran', 320.3386295218035),
 ('inc', 309.87004685769233),
 ('religion', 301.4951807264033),
 ('origin', 301.4951807264033),
 ('san', 266.94885793483627),
 ('type', 251.24598393866944),
 ('methods', 236.17122490234928),
 ('preferred', 224.02766901198024),
 ('status', 211.04662650848232),
 ('marital', 200.99678715093552),
 ('gender', 194.7156375524688),
 ('instructions', 182.15333835553537),
 ("bachelor's", 179.01276355630196)]