### Import Libraries

In [1]:
import re
import pandas as pd
import string
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.chunk import RegexpParser
from nltk import word_tokenize, pos_tag
# Fetch the NLTK resources this notebook relies on (a no-op when already present,
# as the log below shows). Presumably: the perceptron tagger backs pos_tag, punkt
# backs word_tokenize and wordnet backs WordNetLemmatizer.
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
# NOTE(review): no visible cell uses the stopwords corpus — confirm it is still needed.
nltk.download('stopwords')
nltk.download('wordnet')
# Display the punctuation characters that the tokenizer step filters against.
string.punctuation

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

### Preprocessing

In [3]:
# Load the ISCO workbook, keep the rows whose code is longer than three
# characters, and narrow the frame to the title and task-description columns.

data = pd.read_excel('data/isco.xlsx')
has_long_code = data["ISCO 08 Code"].map(lambda code: len(str(code)) > 3)
data = data.loc[has_long_code, ["Title EN", "Tasks include"]]
data.head()

Unnamed: 0,Title EN,Tasks include
3,Legislators,Tasks include -\n(a) presiding over or partic...
4,Senior Government Officials,"Tasks include -\n(a) advising national, state..."
5,Traditional Chiefs and Heads of Villages,Tasks include -\n(a) allocating the use of co...
6,Senior Officials of Special-interest Organizat...,Tasks include -\n(a) determining and formulat...
8,Managing Directors and Chief Executives,"Tasks include -\n(a) planning, directing and ..."


In [4]:
# Remove every literal occurrence of the boilerplate lead-in from the task text

task_lead_in = 'Tasks include -\n(a) '
data['Tasks include'] = data['Tasks include'].str.replace(task_lead_in, '', regex=False)
data.head()

Unnamed: 0,Title EN,Tasks include
3,Legislators,presiding over or participating in the procee...
4,Senior Government Officials,"advising national, state, regional or local g..."
5,Traditional Chiefs and Heads of Villages,allocating the use of communal land and other...
6,Senior Officials of Special-interest Organizat...,"determining and formulating the policies, rul..."
8,Managing Directors and Chief Executives,"planning, directing and coordinating the gene..."


In [5]:
# Count the missing values in each column

missing_counts = data.isna().sum()
print(missing_counts)

Title EN         0
Tasks include    6
dtype: int64


In [8]:
# List the occupation titles that have no task description

is_task_missing = data['Tasks include'].isnull()
no_task = data.loc[is_task_missing, 'Title EN']
no_task

46            Services Managers Not Elsewhere Classified
195    Process Control Technicians Not Elsewhere Clas...
273    Other Artistic and Cultural Associate Professi...
367               Sales Workers Not Elsewhere Classified
462          Handicraft Workers Not Elsewhere Classified
535    Stationary Plant and Machine Operators Not Els...
Name: Title EN, dtype: object

In [9]:
# remove missing value (dropped because the rows with a null task description are
# the "other" job categories)
# Only a missing task description should remove a row, so the column is named
# explicitly; rebinding instead of inplace=True keeps the step a plain expression.

data = data.dropna(subset=['Tasks include'])
data.head()

Unnamed: 0,Title EN,Tasks include
3,Legislators,presiding over or participating in the procee...
4,Senior Government Officials,"advising national, state, regional or local g..."
5,Traditional Chiefs and Heads of Villages,allocating the use of communal land and other...
6,Senior Officials of Special-interest Organizat...,"determining and formulating the policies, rul..."
8,Managing Directors and Chief Executives,"planning, directing and coordinating the gene..."


In [10]:
# lowering text
# The vectorised string accessor replaces the per-row Python lambda and leaves
# missing values as missing instead of raising on them.
data['Tasks include'] = data['Tasks include'].str.lower()

# Preview the lower-cased column
data['Tasks include'].head()

3     presiding over or participating in the procee...
4     advising national, state, regional or local g...
5     allocating the use of communal land and other...
6     determining and formulating the policies, rul...
8     planning, directing and coordinating the gene...
Name: Tasks include, dtype: object

In [12]:
# Function to split a text into clauses, tokenize each clause into words, then remove punctuation tokens
def tokenize_and_remove_punctuation(text):
    """Split ``text`` into clauses and return the word tokens of each clause.

    The text is cut at whitespace that follows one of ``. ! ? , ;``; every
    resulting piece is tokenized with ``word_tokenize`` and tokens made up
    solely of punctuation characters are discarded.

    Parameters
    ----------
    text : str
        The text to process.

    Returns
    -------
    list[list[str]]
        One list of tokens per clause, in order of appearance. A clause that
        contains nothing but punctuation yields an empty list.
    """
    punctuation_chars = set(string.punctuation)

    # Clause boundaries: whitespace preceded by sentence or clause punctuation
    clauses = re.split(r'(?<=[.!?,;])\s+', text)

    tokenized_sentences = []
    for clause in clauses:
        # Check every character of the token rather than testing the token as a
        # substring of string.punctuation: the substring test lets multi-character
        # punctuation tokens such as '``', "''", '--' and '...' through.
        tokens = [
            token
            for token in word_tokenize(clause)
            if not all(char in punctuation_chars for char in token)
        ]
        tokenized_sentences.append(tokens)

    return tokenized_sentences

# Build the 'tokenized' column: a list of token lists for every task description
data['tokenized'] = data['Tasks include'].map(tokenize_and_remove_punctuation)

# Show the new column
print(data['tokenized'])

3      [[presiding, over, or, participating, in, the,...
4      [[advising, national], [state], [regional, or,...
5      [[allocating, the, use, of, communal, land, an...
6      [[determining, and, formulating, the, policies...
8      [[planning], [directing, and, coordinating, th...
                             ...                        
604    [[delivering, messages], [packages, and, other...
605    [[repairing, broken, windows], [screens], [doo...
606    [[filling, storage, areas, of, vending, machin...
607    [[cutting, and, collecting, wood, from, forest...
608    [[in, such, cases, tasks, would, include, a, s...
Name: tokenized, Length: 427, dtype: object


### POS Tag

In [14]:
# Function to POS-tag every token list of one row
def pos_tag_tokens(tokenized_sentences):
    """Run ``pos_tag`` on each token list and collect the results.

    Parameters
    ----------
    tokenized_sentences : list[list[str]]
        Token lists, one per sentence/clause.

    Returns
    -------
    list
        The ``pos_tag`` result for each token list, in the same order.
    """
    return [pos_tag(tokens) for tokens in tokenized_sentences]

# Build the 'pos_tagged' column from the tokenized clauses
data['pos_tagged'] = data['tokenized'].map(pos_tag_tokens)

# Show the new column
print(data['pos_tagged'])

3      [[(presiding, VBG), (over, RP), (or, CC), (par...
4      [[(advising, VBG), (national, JJ)], [(state, N...
5      [[(allocating, VBG), (the, DT), (use, NN), (of...
6      [[(determining, VBG), (and, CC), (formulating,...
8      [[(planning, NN)], [(directing, NN), (and, CC)...
                             ...                        
604    [[(delivering, NN), (messages, NNS)], [(packag...
605    [[(repairing, VBG), (broken, JJ), (windows, NN...
606    [[(filling, VBG), (storage, NN), (areas, NNS),...
607    [[(cutting, VBG), (and, CC), (collecting, VBG)...
608    [[(in, IN), (such, JJ), (cases, NNS), (tasks, ...
Name: pos_tagged, Length: 427, dtype: object


In [15]:
# Function to lemmatize the tokens of POS-tagged sentences
def lemmatize_tokens(pos_tagged_sentences):
    """Replace each token with its WordNet lemma while keeping its POS tag.

    Parameters
    ----------
    pos_tagged_sentences : list[list[tuple[str, str]]]
        Sentences given as lists of ``(token, tag)`` pairs.

    Returns
    -------
    list[list[tuple[str, str]]]
        The same structure with every token replaced by the lemma obtained
        for the WordNet POS that ``get_wordnet_pos`` derives from its tag.
    """
    lemmatizer = WordNetLemmatizer()
    lemmatized_sentences = []
    for tagged_sentence in pos_tagged_sentences:
        lemmas = []
        for word, tag in tagged_sentence:
            lemma = lemmatizer.lemmatize(word, pos=get_wordnet_pos(tag))
            lemmas.append((lemma, tag))
        lemmatized_sentences.append(lemmas)
    return lemmatized_sentences

# Function to translate a tagger tag into a WordNet POS constant
def get_wordnet_pos(treebank_tag):
    """Return the WordNet POS constant matching the first letter of a tag.

    Tags starting with ``J`` map to ADJ, ``V`` to VERB, ``N`` to NOUN and
    ``R`` to ADV; any other tag falls back to NOUN.
    """
    wordnet = nltk.corpus.wordnet
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    # Unrecognised tags are treated as nouns
    return wordnet.NOUN

# Build the 'lemmatized' column from the 'pos_tagged' column
data['lemmatized'] = data['pos_tagged'].map(lemmatize_tokens)

# Show the new column
print(data['lemmatized'])

3      [[(preside, VBG), (over, RP), (or, CC), (parti...
4      [[(advise, VBG), (national, JJ)], [(state, NN)...
5      [[(allocate, VBG), (the, DT), (use, NN), (of, ...
6      [[(determine, VBG), (and, CC), (formulate, VBG...
8      [[(planning, NN)], [(directing, NN), (and, CC)...
                             ...                        
604    [[(delivering, NN), (message, NNS)], [(package...
605    [[(repair, VBG), (broken, JJ), (window, NNS)],...
606    [[(fill, VBG), (storage, NN), (area, NNS), (of...
607    [[(cut, VBG), (and, CC), (collect, VBG), (wood...
608    [[(in, IN), (such, JJ), (case, NNS), (task, NN...
Name: lemmatized, Length: 427, dtype: object


### POS Chunk

In [16]:
# POS chunking: adjacent verb-noun pairs

def extract_verb_noun_pairs(tagged_texts):
    """Collect the verb-noun pairs of every sentence into one flat list.

    Parameters
    ----------
    tagged_texts : list
        Tagged sentences, each one passed to ``filter_verb_noun_pairs``.

    Returns
    -------
    list
        All pairs found, in sentence order.
    """
    return [
        pair
        for tagged_sentence in tagged_texts
        for pair in filter_verb_noun_pairs(tagged_sentence)
    ]

def filter_verb_noun_pairs(tagged_sentence):
    """Return ``(verb, noun)`` for each verb token directly followed by a noun.

    Parameters
    ----------
    tagged_sentence : list[tuple[str, str]]
        ``(word, tag)`` pairs of one sentence.

    Returns
    -------
    list[tuple[str, str]]
        A pair for every position where the tag starts with ``VB`` and the
        next token's tag starts with ``NN``.
    """
    neighbours = zip(tagged_sentence, tagged_sentence[1:])
    return [
        (word, next_word)
        for (word, tag), (next_word, next_tag) in neighbours
        if tag.startswith('VB') and next_tag.startswith('NN')
    ]

# Extract verb-noun pairs from the 'lemmatized' column
data['chunked_verb_noun'] = data['lemmatized'].map(extract_verb_noun_pairs)

# Display the extracted pairs
data['chunked_verb_noun']

3      [(direct, policy), (repeal, law), (investigate...
4      [(include, amendment), (establish, objective),...
5              [(settle, dispute), (discipline, member)]
6      [(govern, body), (direct, section), (evaluate,...
8      [(govern, body), (determine, objective), (mana...
                             ...                        
604    [(deliver, luggage), (mark, baggage), (attach,...
605     [(adjust, door), (replace, tap), (unload, coal)]
606    [(fill, storage), (vend, machine), (collect, m...
607    [(collect, wood), (visit, forest), (pick, piec...
608    [(collect, ticket), (examine, ticket), (verify...
Name: chunked_verb_noun, Length: 427, dtype: object

In [17]:
# POS chunk new pattern (Pak Indra)

# define pattern
# A "Chunk" is one or more repetitions of the following tag sequence:
#   <ADJ|VB\w*>+             one or more tokens tagged ADJ or VB*
#   <\.|RP|IN|CC|PRP\w*>*    any number of tokens tagged ".", RP, IN, CC or PRP*
#   <DT>*                    any number of tokens tagged DT
#   <NN\w*|RB\w*>+           one or more tokens tagged NN* or RB*
# NOTE(review): the tagged output shown in this notebook labels adjectives JJ
# (e.g. (national, JJ)), so the ADJ alternative presumably never matches —
# confirm whether JJ\w* was intended.
pattern = r'''Chunk: {(<ADJ|VB\w*>+<\.|RP|IN|CC|PRP\w*>*<DT>*<NN\w*|RB\w*>+)+}'''
chunk_parser = RegexpParser(pattern)

def extract(tagged_texts):
    """Chunk every tagged sentence and return all chunks as one flat list.

    Each sentence is parsed with ``chunk_parser``, the chunk subtrees are
    pulled out with ``extract_chunks`` and converted to word tuples with
    ``trees_to_tuples``.

    Parameters
    ----------
    tagged_texts : list
        Tagged sentences, each a list of ``(word, tag)`` pairs.

    Returns
    -------
    list[tuple]
        The word tuples of all chunks, in sentence order.
    """
    chunks_per_sentence = (
        trees_to_tuples(extract_chunks(chunk_parser.parse(tagged_text)))
        for tagged_text in tagged_texts
    )
    return [
        chunk
        for sentence_chunks in chunks_per_sentence
        for chunk in sentence_chunks
    ]

def extract_chunks(tree):
    """Return every subtree of ``tree`` whose label is not ``'S'``.

    The tree is walked depth-first in pre-order. Anything that is not an
    ``nltk.Tree`` (for example a leaf) contributes nothing.

    Parameters
    ----------
    tree : nltk.Tree or any other object
        The parse result, or a node reached while recursing.

    Returns
    -------
    list
        The matching ``nltk.Tree`` nodes, parents before their children.
    """
    if not isinstance(tree, nltk.Tree):
        return []

    # The sentence-level node itself is not a chunk
    found = [] if tree.label() == 'S' else [tree]
    for child in tree:
        found += extract_chunks(child)
    return found

def trees_to_tuples(tree_list):
    """Turn each tree into a tuple holding the first element of every leaf.

    Parameters
    ----------
    tree_list : list
        Objects exposing ``leaves()``; each leaf is indexed with ``[0]``.

    Returns
    -------
    list[tuple]
        One tuple per tree, in the same order.
    """
    word_tuples = []
    for chunk in tree_list:
        words = [leaf[0] for leaf in chunk.leaves()]
        word_tuples.append(tuple(words))
    return word_tuples

# Run the chunk grammar over the 'lemmatized' column
data['chunked_new_pattern'] = data['lemmatized'].map(extract)

# Display the extracted chunks
data['chunked_new_pattern']



3      [(participate, in, the, proceeding), (direct, ...
4      [(advise, on, the, preparation), (include, ame...
5      [(allocate, the, use), (settle, dispute), (dis...
6      [(formulate, the, policy), (review, the, opera...
8      [(review, the, operation), (govern, body), (de...
                             ...                        
604    [(deliver, luggage), (mark, baggage), (attach,...
605    [(adjust, door), (replace, tap, washer), (put,...
606    [(fill, storage, area), (vend, machine), (coll...
607    [(collect, wood), (visit, forest), (pick, piec...
608    [(include, a, selling, admission, ticket), (co...
Name: chunked_new_pattern, Length: 427, dtype: object

### Export Result

In [12]:
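# # the result (export is currently disabled)

# NOTE(review): 'title_and_abstract' is not a column of `data` in this notebook
# (the title column used above is 'Title EN') — confirm the column name before re-enabling.
# patent_task = data[["title_and_abstract", "chunked_verb_noun", "chunked_new_pattern"]]
# patent_task.to_excel('export_result/chunk2.xlsx', index=False)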