In [1]:
super_compound_phrases_list = []

In [2]:
from IPython.display import HTML
from itertools import product
import pandas as pd
import urllib.request, json
import urllib.parse # For encoding the URL string to UTF-8
import random
import time
import csv
import re

from bs4 import BeautifulSoup
import en_core_web_sm
import contractions
import unidecode
import requests
import string
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Don't collapse Pandas Dataframes:
pd.set_option('display.max_rows', None, 'display.max_columns', None)

In [3]:
# ## Import Training Data (Text Corpus) via input URL:
# url = 'https://en.wikipedia.org/wiki/Music_theory'
# headers = {
#     'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/W.X.Y.Z Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
#           }
# req = requests.get(url, headers)
# soup = BeautifulSoup(req.content, 'html.parser')
# training_text = soup.get_text()
# print(training_text)

In [4]:
## Import Training Data (Text Corpus) via input File:
with open('training-text.txt', encoding='utf-8') as input_file:
    training_text = input_file.read()
sample_training_text = training_text[0:2000]
print(sample_training_text)

  `Anyone who has not worked for them
     simply cannot understand them.'

         - Mille Vennamun, introduction to:
          `The Use of Ashes: Bureau of Procuration Manual'

  Half past eight.  The bedside alarm woke Kelanie up with the
sampled victory-screech of some carnivorous xenoform.  She was up
immediately, eyes wide, fingers clawing the pillow-pads, gasping with
shock as the subconsciously-induced adrenalin shivered through her
system.   As she calmed down, her pupils dilated out from crisis-
induced pinpricks, her breathing and pulse rates returned to normal,
and she wondered, not for the first or last time, if life was like
this in the private sector.   She scrambled off the bed as it began
to deflate and retract into the wall.
  Her personalised holographic news service activated as she stepped
into the shower.   It took the appearance of an old man dressed in a
monk's habit, who bore a strong resemblance to William S. Burroughs.
It leered at her, and croaked,
  `Rough

In [5]:
training_text_list = []
training_text_list.append(training_text)
training_text_list



In [6]:
############################################################
############################################################
## Clean Training Data: (Text Pre-processing)
def text_preprocessor(text):
    ###########################################
    soup = BeautifulSoup(text, 'html.parser') ## Remove HTML
    text = soup.get_text(separator=' ') ## Remove HTML
    ###########################################
    text = text.lower() ## Lowercase Characters
    text = contractions.fix(text) ## Expand Contractions ("don't" -> "do not")
    text = re.sub(r'https?:\S*', '', text) ## Remove URLs
    text = re.sub(r'@\S*', '', text) ## Remove Twitter Mentions
    text = re.sub(r'#\S*', '', text) ## Remove Hashtags
    text = re.sub(r'[^a-zA-z0-9.,!?/:;\"\'\s]', '', text) ## Remove special characters (e.g: %, $, &, etc.)
    text = text.translate(str.maketrans('', '', string.punctuation)) ## Remove Punctuation Characters
    text = re.sub(r'[0-9]+', '', text) ## Remove Numerical Characters
    text = unidecode.unidecode(text) ## Normalized accented characters (ñ -> n)
    ###########################################
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text) # no emoji
    ###########################################
    text = word_tokenize(text) ## Tokenize Text    
    stop_words = set(stopwords.words('english')) ## Get Stop Words
    stop_words_exclusion = ['no','not','nor'] ## Stop Word Exclusion List
    stop_words = [word for word in stop_words if word not in stop_words_exclusion] ## Remove Stop Word Exclusions from Stop Words
    text = [word for word in text if word not in stop_words] ## Remove Stop Words
    ###########################################
    # ps = PorterStemmer() ## Stemming: ['wait', 'waiting', 'waited', 'waits'] -> 'wait'
    # text = [ps.stem(word) for word in text] ## Apply Word Stemming
    wnl = WordNetLemmatizer() ## Lemmatization: 'studies' -> 'study'; 'studying' -> 'studying'
    text = [wnl.lemmatize(word) for word in text] ## Apply Word Lemmatization
    
    return text

In [7]:
combined_super_df = pd.DataFrame()

for training_text in training_text_list:
    text = training_text
    
    nlp = en_core_web_sm.load() 
    doc = nlp(text[:900000]) ###<<<<<<<<<<<<<<###<<<<<<<<<<<<<<###<<<<<<<<<<<<<<###<<<<<<<<<<<<<<###<<<<<<<<<<<<<<
    array = [(X, X.ent_iob_, X.ent_type_) for X in doc]
    df = pd.DataFrame(array, columns=['word','iob','entity type'])
    description_dict = {'B':'beginning of an entity',
                        'I':'inside of an entity',
                        'O':'outside of an entity'}
    df['iob description'] = df['iob'].map(description_dict)
    description_dict = {'PERSON':'people, including fictional',
                        'NORP':'nationalities, religious or political groups',
                        'FAC':'buildings, airports, highways, bridges, etc',
                        'ORG':'companies, agencies, institutions, etc',
                        'GPE':'countries, cities, states',
                        'LOC':'non-GPE locations, mountain ranges, bodies of water',
                        'PRODUCT':'objects, vehicles, foods, etc (not services)',
                        'EVENT':'named hurricanes, battles, wars, sports events, etc',
                        'WORK_OF_ART':'titles of books, songs, etc',
                        'LAW':'named documents made into laws',
                        'LANGUAGE':'any named language',
                        'DATE':'absolute or relative dates or periods',
                        'TIME':'times smaller than a day',
                        'PERCENT':'percentage',
                        'MONEY':'monetary values',
                        'QUANTITY':'measurements, as of weight or distances',
                        'ORDINAL':'first, second, etc',
                        'CARDINAL':'numerals that do not fall under another type'
                       }
    df['entity type description'] = df['entity type'].map(description_dict)
    df = df[df['iob'].str.contains('O') == False]
    # df.head(100)
    
    entity_type_list = []
    compound_phrase_list = []
    iob_compound_list = []
    prev_iob = 'B'
    for index, row in df.iterrows():
        curr_iob = row['iob']
        word = row['word']
        entity_type = row['entity type']
        # print(iob)
        if prev_iob == 'B' and curr_iob == 'B':
            prev_iob = curr_iob
            try:
                list_of_strings = [i.text for i in iob_compound_list]
                # print(list_of_strings)
                list_of_compound_phrases = ' '.join(list_of_strings)
            except:
                pass
            compound_phrase_list.append(list_of_compound_phrases)
            entity_type_list.append(entity_type)
            iob_compound_list = []
            iob_compound_list.append(word)
            # print(iob_compound_list)
        elif prev_iob == 'B' and curr_iob == 'I':
            prev_iob = curr_iob
            iob_compound_list.append(word)
            # print(iob_compound_list)
        elif prev_iob == 'I' and curr_iob == 'I':
            prev_iob = curr_iob
            iob_compound_list.append(word)
            # print(iob_compound_list)
        elif prev_iob == 'I' and curr_iob == 'B':
            prev_iob = curr_iob
            try:
                list_of_strings = [i.text for i in iob_compound_list]
                # print(list_of_strings)
                list_of_compound_phrases = ' '.join(list_of_strings)
            except:
                pass
            compound_phrase_list.append(list_of_compound_phrases)
            entity_type_list.append(entity_type)
            iob_compound_list = []
            iob_compound_list.append(word)
            # print(iob_compound_list)
        else:
            pass
    
    compound_phrase_list = compound_phrase_list[1:] # Removes empty value from start of list
    compound_phrase_df = pd.DataFrame(compound_phrase_list, columns=['compound phrase'])
    compound_phrase_entity_df = pd.DataFrame(entity_type_list, columns=['entity type'])
    result_df = pd.concat([compound_phrase_df, compound_phrase_entity_df], axis=1)
    # result_df.drop_duplicates().sort_values(by='compound phrase', ascending=True).head(1000)
    
    compound_phrases_df = pd.DataFrame()
    # compound_phrases_df['compound phrase'] = result_df[~result_df['compound phrase'].str.contains('^\w*$')]
    result_df = result_df.dropna() # Removes any NaN values
    compound_phrases_df = result_df[~result_df['compound phrase'].str.match('^\w*$')] # Removes single words from the results to get compound queries

    compound_phrases_df = compound_phrases_df['compound phrase'].sort_values(ascending=True)
    compound_phrases_df = compound_phrases_df.drop_duplicates()

    compound_phrases_list = compound_phrases_df.tolist()
    # print(compound_phrases_list)
    prepocessed_compound_phrase_list = []
    for compound_phrase in compound_phrases_list:
        text = compound_phrase
        ###########################################
        soup = BeautifulSoup(text, 'html.parser') ## Remove HTML
        text = soup.get_text(separator=' ') ## Remove HTML
        ###########################################
        text = text.lower() ## Lowercase Characters
        #text = contractions.fix(text) ## Expand Contractions ("don't" -> "do not")
        #text = re.sub(r'https?:\S*', '', text) ## Remove URLs
        #text = re.sub(r'@\S*', '', text) ## Remove Twitter Mentions
        #text = re.sub(r'#\S*', '', text) ## Remove Hashtags
        text = re.sub(r'[^a-zA-z0-9.,!?/:;\"\'\s]', '', text) ## Remove special characters (e.g: %, $, &, etc.)
        text = text.replace('\n','')
        text = text.translate(str.maketrans('', '', string.punctuation)) ## Remove Punctuation Characters
        # text = re.sub(r'[0-9]+', '', text) ## Remove Numerical Characters
        text = unidecode.unidecode(text) ## Normalized accented characters (ñ -> n)
        ###########################################
        emoji_pattern = re.compile("["
            u"\U0001F600-\U0001F64F"  # emoticons
            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
            u"\U0001F680-\U0001F6FF"  # transport & map symbols
            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               "]+", flags=re.UNICODE)
        text = emoji_pattern.sub(r'', text) # no emoji
        ###########################################
        # text = word_tokenize(text) ## Tokenize Text    
        # stop_words = set(stopwords.words('english')) ## Get Stop Words
        # stop_words_exclusion = ['no','not','nor'] ## Stop Word Exclusion List
        # stop_words = [word for word in stop_words if word not in stop_words_exclusion] ## Remove Stop Word Exclusions from Stop Words
        # text = [word for word in text if word not in stop_words] ## Remove Stop Words
        ###########################################
        # ps = PorterStemmer() ## Stemming: ['wait', 'waiting', 'waited', 'waits'] -> 'wait'
        # text = [ps.stem(word) for word in text] ## Apply Word Stemming
        # wnl = WordNetLemmatizer() ## Lemmatization: 'studies' -> 'study'; 'studying' -> 'studying'
        # text = [wnl.lemmatize(word) for word in text] ## Apply Word Lemmatization
        ###########################################
        text = text.strip()
        text = text.replace('  ',' ')
        text = text.replace('  ',' ')
        prepocessed_compound_phrase_list.append(text)
        
        new_df = pd.DataFrame(prepocessed_compound_phrase_list)
    new_df = new_df[~new_df[0].str.match('^\w*$')] # Removes single words from the results to get compound queries
    prepocessed_compound_phrase_list = new_df[0].tolist()
    print(prepocessed_compound_phrase_list)
    super_compound_phrases_list.append(prepocessed_compound_phrase_list)

['1730 marek', '180 degree', '195 km sec', '35 minutes 45 seconds', '40 hours', '978 percent', 'alexander pope', 'aln riker', 'barber xeno', 'baylal delvoy kendr teff', 'bureau of procuration', 'chapter five', 'chapter one the export', 'chapter two', 'church of the subgenius', 'circle within the circle', 'comoncurensy isotope', 'diplomatic exchange', 'earth diplomatic exchange', 'earth export', 'gaeren tuuri', 'guidance ai', 'handelsman tsiry feylen', 'interspecies advisory', 'kayren kayley', 'kayren kayley', 'kelanie s', 'kelanie and marek', 'kendr saranaxio', 'kendr saranaxio parndta athanasius', 'kendr saranaxio', 'maracite information exchange registry', 'marek waddell', 'marek keery', 'marek waddell', 'martini baton', 'mileva barker', 'millimillenarian technological control', 'miss camden', 'moridani   phandric', 'moridani cause', 'moridani partisan', 'n frf bla g', 'n frf knh k', 'n frf knh k', 'n svw tre a', 'nikolai kingsley', 'ninety years', 'plateau bythian', 'plateau bythian

In [8]:
flat_super_compound_phrases_list = [item for sublist in super_compound_phrases_list for item in sublist]
flat_super_compound_phrases_list = list(set(flat_super_compound_phrases_list))
print(flat_super_compound_phrases_list)

['kelanie and marek', 'twenty seven days', 'twelve years old', 'about an hour', 'one hundred percent', 'three feet', 'the use of ashes bureau of procuration manual  half past eight', 'earth diplomatic exchange', 'terrestrial anglic', 'some two hours', 'the underground networks', 'n frf knh k', 'li svayene', 'kelanie s', 'millimillenarian technological control', 'about three meters', 'half an hour', 'three thousand years ago', 'about a metre', 'this mortal coil  you', 'two metre tall', 'some twenty hours later', 'miss camden', 'nine thirty', 'up yours ugly', 'minus two', 'about fifty', 'a few moments', 'ninety years', 'a few metres', 'a few minutes', 'just under two metres', 'the office of threat termination', 'three quarters', 'quite nice ice cream parlour', 'pre millennium', 'her work credit hour meter', 'nikolai kingsley', 'robyn starkey', 'rik mayall', 'interspecies advisory', '180 degree', 'nine years', 'six foot', '978 percent', 'finding free data channel please wait', 'this after