# Analyze the co-occurences of the terms in the required qualifications

In [46]:
import os
import json
import string
import nltk
from nltk import word_tokenize, FreqDist, pos_tag
from nltk.corpus import stopwords
import pandas as pd
import pickle


In [47]:
df_ori = pd.read_csv('../data/data job posts.csv')
df_ori.head()
print(df_ori.shape)
df = df_ori.drop_duplicates(['RequiredQual'])
print(df.shape)
print("Removed {0} duplicates (based on RequiredQual)".format(df_ori.shape[0]-df.shape[0]))

(19001, 24)
(16689, 24)
Removed 2312 duplicates (based on RequiredQual)


In [48]:
df["RequiredQual"].head()

0    To perform this job successfully, an\r\nindivi...
1    - Bachelor's Degree; Master's is preferred;\r\...
2    - Degree in environmentally related field, or ...
3    - Advanced degree in public health, social sci...
4    - University degree; economical background is ...
Name: RequiredQual, dtype: object

In [49]:
df["RequiredQual"] = df["RequiredQual"].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


## Try the NLTK POS tagger

In [7]:
sent = '''Professor Tan Eng Chye, NUS Deputy President and Provost, and Professor 
Menahem Ben-Sasson, President of HUJ signed the joint degree agreement at NUS, 
in the presence of Ambassador of Israel to Singapore Her Excellency Amira Arnon 
and about 30 invited guests, on July 03, 2013.
'''
pos_tagged = pos_tag(word_tokenize(sent), tagset='universal')
#pos_tagged

In [8]:
df['RequiredQual_token'] = df['RequiredQual'].map(word_tokenize)
df['Required_Qual_POS'] = df['RequiredQual_token'].map(pos_tag, {"tagset": "universal"})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [9]:
df['Required_Qual_POS'].head()

0    [(To, TO), (perform, VB), (this, DT), (job, NN...
1    [(-, :), (Bachelor, NN), ('s, POS), (Degree, N...
2    [(-, :), (Degree, NN), (in, IN), (environmenta...
3    [(-, :), (Advanced, VBD), (degree, JJ), (in, I...
4    [(-, :), (University, NNP), (degree, VBP), (;,...
Name: Required_Qual_POS, dtype: object

In [11]:
# df['Required_Qual_POS'].to_csv("../outputs/nltk_pos1.csv", index=False)

## Read the POS tagged data for further analysis

In [14]:
pos_tag_df = pd.read_csv("../outputs/nltk_pos1.csv", header=None)

In [15]:
pos_tag_df.columns

Int64Index([0], dtype='int64')

In [17]:
pos_tag_df.columns = ["Qual_POS_Tagged"]

In [29]:
pos_tag_df.head(5)

Unnamed: 0,Qual_POS_Tagged
0,"[('To', 'TO'), ('perform', 'VB'), ('this', 'DT..."
1,"[('-', ':'), ('Bachelor', 'NN'), (""'s"", 'POS')..."
2,"[('-', ':'), ('Degree', 'NN'), ('in', 'IN'), (..."
3,"[('-', ':'), ('Advanced', 'VBD'), ('degree', '..."
4,"[('-', ':'), ('University', 'NNP'), ('degree',..."


In [41]:
import ast
noun_family = ["NN", "NNS", "NNP", "NNPS"]
def extract_noun_family(word_pos_pairs):
    
    if isinstance(word_pos_pairs, str):
        # https://stackoverflow.com/questions/10775894/converting-a-string-representation-of-a-list-into-an-actual-list-object
        word_pos_pairs = ast.literal_eval(word_pos_pairs)
    
    nouns_with_types = []
    for word_pos_pair in word_pos_pairs:
        #print("word_pos_pair: {0}".format(word_pos_pair))
        word = word_pos_pair[0]
        pos_type = word_pos_pair[1]
        if pos_type in noun_family:
            nouns_with_types.append((word, pos_type))
            
    return nouns_with_types
            
        

In [42]:
res = extract_noun_family([('To', 'TO'), ('perform', 'VB'), ('this', 'DT'), ('job', 'NN'), ('successfully', 'RB')])
res

[('job', 'NN')]

In [43]:
pos_tag_df["nouns"] = pos_tag_df["Qual_POS_Tagged"].map(extract_noun_family)
pos_tag_df["nouns"].head()

0    [(job, NN), (duty, NN), (requirements, NNS), (...
1    [(Bachelor, NN), (Degree, NNP), (Master, NNP),...
2    [(Degree, NN), (field, NN), (years, NNS), (exp...
3    [(health, NN), (science, NN), (communication, ...
4    [(University, NNP), (background, NN), (plus, N...
Name: nouns, dtype: object

## NLTK POS tag again with only nouns

In [51]:
df['RequiredQual_token'] = df['RequiredQual'].map(word_tokenize)
df["nouns"] = df["RequiredQual_token"].map(pos_tag, {"tagset": "NOUN"})
df["nouns"].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


0    [(To, TO), (perform, VB), (this, DT), (job, NN...
1    [(-, :), (Bachelor, NN), ('s, POS), (Degree, N...
2    [(-, :), (Degree, NN), (in, IN), (environmenta...
3    [(-, :), (Advanced, VBD), (degree, JJ), (in, I...
4    [(-, :), (University, NNP), (degree, VBP), (;,...
Name: nouns, dtype: object

## Write the nouns to a file

In [52]:
pos_tag_df["nouns"].to_csv("../outputs/nltk_pos_nouns.csv", index=False)

## Named entity recognition to extract degree requirements from the required qualifications

In [54]:
pos_tag_df["Qual_POS_Tagged"].head()

0    [('To', 'TO'), ('perform', 'VB'), ('this', 'DT...
1    [('-', ':'), ('Bachelor', 'NN'), ("'s", 'POS')...
2    [('-', ':'), ('Degree', 'NN'), ('in', 'IN'), (...
3    [('-', ':'), ('Advanced', 'VBD'), ('degree', '...
4    [('-', ':'), ('University', 'NNP'), ('degree',...
Name: Qual_POS_Tagged, dtype: object

In [55]:
from nltk import ne_chunk

def perform_NER(word_pos_pairs):
    if isinstance(word_pos_pairs, str):
        # https://stackoverflow.com/questions/10775894/converting-a-string-representation-of-a-list-into-an-actual-list-object
        word_pos_pairs = ast.literal_eval(word_pos_pairs)
        
    return ne_chunk(word_pos_pairs)

pos_tag_df["Qual_Chuncked"] = pos_tag_df["Qual_POS_Tagged"].map(perform_NER)
pos_tag_df["Qual_Chuncked"].head()

0    [(To, TO), (perform, VB), (this, DT), (job, NN...
1    [(-, :), [(Bachelor, NN)], ('s, POS), [(Degree...
2    [(-, :), [(Degree, NN)], (in, IN), (environmen...
3    [(-, :), (Advanced, VBD), (degree, JJ), (in, I...
4    [(-, :), [(University, NNP)], (degree, VBP), (...
Name: Qual_Chuncked, dtype: object

In [56]:
pos_tag_df["Qual_Chuncked"].to_csv("../outputs/nltk_ner.csv", index=False, header="NER_output")