In [None]:
import os
import os.path as osp
from textblob import Word, TextBlob
from nltk.stem import PorterStemmer
import nltk
from nltk.corpus import stopwords
import re
import contractions
from collections import Counter
import unicodedata
import json
import numpy as np
import itertools

In [None]:
# %%
def extract_noun_phrases(pos_tags):
    """
    Collect candidate noun phrases from a POS-tagged token sequence.

    Every token tagged as a noun, an adjective (JJ/JJR/JJS) or a gerund (VBG)
    is tried as the start of a phrase. Afterwards every noun token that did
    not itself come out as a phrase is appended as a single-word entry.

    Parameters
    ----------
    pos_tags : sequence of (token, tag) pairs
        Tokens are joined with spaces, so they must be strings. The tag
        names tested below are 'DT', 'CD', 'NN', 'NNS', 'NNP', 'NNPS',
        'JJ', 'JJR', 'JJS', 'IN' and 'VBG'.

    Returns
    -------
    list of str
        Phrases in start-position order (duplicates are possible), followed
        by the remaining single nouns in token order.
    """
    noun_tags = ('NN', 'NNS', 'NNP', 'NNPS')
    start_tags = noun_tags + ('JJ', 'JJR', 'JJS', 'VBG')

    def parse_noun_phrase(indices, num_tokens):
        """
        Build at most one phrase per entry of `indices`.

        Each entry is the position just BEFORE a possible phrase start, so
        scanning begins at `index + 1` and runs until a token ends the phrase
        or the sequence is exhausted. Starts that yield no noun/gerund token
        produce nothing.
        """
        noun_phrases = []
        for index in indices:
            noun_phrase_tokens = []
            adj_phrase_tokens = []
            has_noun = False
            for i in range(index + 1, num_tokens):
                token, tag = pos_tags[i]
                if tag == 'DT' or tag == 'CD':
                    # Articles and cardinals are never part of the phrase:
                    # skipped before the first noun, terminating after it.
                    if has_noun:
                        break
                    continue
                elif tag in noun_tags:
                    noun_phrase_tokens.append(token)
                    has_noun = True
                elif tag == 'JJ':
                    # An adjective after a noun belongs to the next phrase.
                    if has_noun:
                        break
                    adj_phrase_tokens.append(token)
                elif tag == 'IN':
                    # Only the preposition "of" (exact, case-sensitive match)
                    # may continue a phrase, and only after a noun.
                    if has_noun and token == 'of':
                        noun_phrase_tokens.append(token)
                    else:
                        break
                elif tag == 'VBG':
                    # A gerund is kept when it directly follows "of" or
                    # directly precedes a noun; otherwise it is skipped
                    # without ending the scan.
                    if i + 1 == num_tokens:
                        continue    # last token: nothing follows, scan ends
                    next_tag = pos_tags[i + 1][1]
                    preceded_by_of = False
                    if i > 0:
                        # Look behind only when a previous token exists.
                        # At i == 0, pos_tags[i - 1] would wrap around to the
                        # LAST token of the sequence.
                        prev_token, prev_tag = pos_tags[i - 1]
                        preceded_by_of = prev_tag == 'IN' and prev_token == 'of'
                    if preceded_by_of or next_tag in noun_tags:
                        noun_phrase_tokens.append(token)
                else:
                    break
            noun_phrase = ' '.join(noun_phrase_tokens)
            if not noun_phrase:
                continue
            if adj_phrase_tokens:
                # Collected adjectives are placed in front of the noun part.
                noun_phrase = ' '.join(adj_phrase_tokens) + ' ' + noun_phrase
            noun_phrases.append(noun_phrase)
        return noun_phrases

    num_tokens = len(pos_tags)

    # Brute force: try every noun/adjective/gerund position as a phrase start.
    # Stored as i - 1 because parse_noun_phrase scans from index + 1.
    indices = [i - 1 for i, item in enumerate(pos_tags) if item[1] in start_tags]
    noun_phrases = parse_noun_phrase(indices, num_tokens)

    # Add every noun token that is not already present as a phrase on its own.
    known_phrases = set(noun_phrases)
    noun_phrases += [item[0] for item in pos_tags
                     if item[1] in noun_tags and item[0] not in known_phrases]
    return noun_phrases

In [None]:
# %%
def analyse(parsed_tokens):
    """
    Tag dictionary-matched tokens and count their occurrences.

    Tokens are kept only if they are members of at least one of the
    module-level containers `time_dict`, `location_dict` or `vc_dict`
    (defined outside this function; only `in` membership is used here).

    Parameters
    ----------
    parsed_tokens : iterable of str
        Candidate tokens / phrases.

    Returns
    -------
    list of (token, label, count) tuples
        `label` is 'TIME', 'LOCATION' or 'CONCEPT'. A token found in several
        containers yields one tuple per container. `count` is the number of
        occurrences after the multi-word adjustment described below, and is
        always positive.
    """
    filtered_tokens = [token for token in parsed_tokens
                       if token in time_dict or token in location_dict or token in vc_dict]
    # Longest strings first, so a multi-word phrase is handled before the
    # single words it contains (Counter iterates in first-insertion order).
    filtered_tokens.sort(key=len, reverse=True)
    token_counter = Counter(filtered_tokens)

    # Per-token adjustment (zero or negative) applied to the raw count.
    minus_counter = dict.fromkeys(token_counter, 0)

    tagged_tokens = []
    for token, cnt in token_counter.items():
        cnt += minus_counter[token]
        # Skip tokens fully accounted for by longer phrases. The adjustment
        # can exceed the raw count when one word belongs to several matched
        # phrases, so test for <= 0 rather than == 0.
        if cnt <= 0:
            continue
        word_tokens = nltk.word_tokenize(token)
        if len(word_tokens) > 1:
            # Each matched word of a multi-word phrase is discounted once per
            # distinct phrase; words that were not matched are ignored.
            for wtoken in word_tokens:
                if wtoken in minus_counter:
                    minus_counter[wtoken] -= 1
        if token in time_dict:
            tagged_tokens.append((token, 'TIME', cnt))
        if token in location_dict:
            tagged_tokens.append((token, 'LOCATION', cnt))
        if token in vc_dict:
            tagged_tokens.append((token, 'CONCEPT', cnt))
    return tagged_tokens

In [None]:
def preprocess_tokens(noun_phrases):
    """
    Normalise every phrase into a space-joined string of stemmed tokens.

    Each phrase is lower-cased and tokenized with `nltk.word_tokenize`. Every
    token is then:
      1. NFKD-normalised and stripped of non-ASCII characters,
      2. dropped if it is a member of the module-level `stop_words`,
      3. stemmed with the Porter stemmer,
      4. dropped unless the stem is purely alphabetic and longer than one
         character.

    Returns a list with exactly one string per input phrase, in input order;
    a phrase whose tokens are all dropped yields an empty string.
    """
    stemmer = PorterStemmer()

    def clean_phrase(phrase):
        # Run one phrase through the per-token pipeline described above.
        kept_stems = []
        for raw_token in nltk.word_tokenize(phrase.lower()):
            decomposed = unicodedata.normalize('NFKD', raw_token)
            ascii_token = decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore')
            if ascii_token in stop_words:
                continue
            stem = stemmer.stem(ascii_token)
            if len(stem) > 1 and stem.isalpha():
                kept_stems.append(stem)
        return ' '.join(kept_stems)

    return [clean_phrase(phrase) for phrase in noun_phrases]