# Kunlin_CSC594-910Online_FinalProject_Demo

## Section 0: Import Libraries

In [10]:
import nltk
from nltk import word_tokenize, sent_tokenize, pos_tag
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
import gensim
from gensim.models import Word2Vec
from bs4 import BeautifulSoup
import os
import re
import pandas as pd
from pandas import ExcelWriter
from pandas import ExcelFile
import numpy as np

## Section 1: Read In Data

In [1]:
def chunk_wikitext(text_oh):
    """Strip HTML and wiki markup from an article and chunk its link text.

    The raw text is passed through BeautifulSoup to drop HTML tags, links of
    the form ``[[prefix:target]]`` are removed, and the remainder is split
    with ``nltk.wordpunct_tokenize``.  Tokens inside ``{{...}}`` templates and
    inside single-bracket ``[...]`` external links are discarded.  The words
    of each ``[[...]]`` link are grouped into an ``NE`` subtree; when the link
    contains ``|`` only the words after the last ``|`` are kept.

    Args:
        text_oh: Raw article text, possibly containing HTML and wiki markup.

    Returns:
        An ``nltk.Tree`` labelled ``S`` whose children are plain word tokens
        and ``NE`` subtrees.
    """
    soup = BeautifulSoup(text_oh, 'html.parser')
    text = soup.get_text()
    text = re.sub(r'\[\[.*?:.*?\]\]', '', text)    # remove interwiki links
    template_depth = 0
    in_link = in_ext_link = False
    link_words = []
    pieces = []
    for word in nltk.wordpunct_tokenize(text):
        # skip everything in template braces
        if '{{' in word:
            template_depth += 1
        elif '}}' in word:
            # Clamp at zero so an unbalanced closing brace cannot push the
            # depth negative and hide all of the text that follows it.
            template_depth = max(template_depth - 1, 0)
        elif template_depth == 0:
            # detect links
            if '[[' in word:
                in_link = True
            elif ']]' in word:
                # at the end of a link, output it as a named entity chunk
                if link_words:
                    pieces.append(nltk.Tree('NE', link_words))
                    link_words = []
                # Always leave link mode, even for an empty link such as
                # "[[target|]]"; otherwise the following plain text would be
                # collected as link words.
                in_link = False
            elif '[' in word:
                # single bracket: start of an external link, whose content
                # is dropped
                in_ext_link = True
            elif in_ext_link and ']' in word:
                in_ext_link = False
            elif '|' in word and in_link:
                # start over if the link has | marking an alternate name
                link_words = []
            else:
                if in_link:
                    link_words.append(word)
                elif not in_ext_link:
                    pieces.append(word)
    return nltk.Tree('S', pieces)


# FINAL USED TO READ IN THE TEXT DATA
def readin_text_nohtml_notree(path, limit_n=50000):
    """Read article files from a directory into word-tokenized sentences.

    Each file is cleaned with ``chunk_wikitext``, flattened back to a plain
    string, split into sentences and then into word tokens.

    Args:
        path: Directory containing the original text files of the dataset.
        limit_n: Maximum number of files to read.  Files are taken in the
            order returned by ``os.listdir``, which is not guaranteed to be
            sorted.

    Returns:
        One flat list with the word-tokenized sentences (each a list of
        tokens) of all files that were read.

    Needed libraries: os, re, nltk, BeautifulSoup.
    """
    files = []
    for index, filename in enumerate(os.listdir(path)):
        # Stop once limit_n files have been consumed.
        if index >= limit_n:
            break
        # Plain 'r' already uses universal newlines; the legacy 'rU' mode
        # raises ValueError on Python 3.11+.
        with open(os.path.join(path, filename), 'r') as file:
            text = file.read()
        text_tree = chunk_wikitext(text)
        text_notree = ' '.join(text_tree.leaves())
        files.extend(
            nltk.word_tokenize(sent)
            for sent in nltk.sent_tokenize(text_notree)
        )
    return files

In [5]:
%%time
# Directory whose files are read by readin_text_nohtml_notree.
path = 'articles/'
# Tokenized sentences for the demo; limit_n caps how many files are read.
demo_data = readin_text_nohtml_notree(path, limit_n=5000)



CPU times: user 1min 41s, sys: 2.24 s, total: 1min 43s
Wall time: 1min 54s


## Section 2: Knowledge Extraction Function

In [6]:
def grammar_VBN(sent):
    """Extract a "<thing> is/are located at <place>" clause from a sentence.

    Args:
        sent: One sentence as a list of word tokens.

    Returns:
        A list containing the ``(token, tag)`` leaves of the first matching
        ``CLAUSE`` chunk, or an empty list when the sentence has no trigger
        word or no chunk passes the filter.
    """
    # Second part: used as verb, past participle (VBN)
    # IBM <NP> is/are <VB> located <VBN> at <IN> Chicago <NE>
    # IBM <NP>, <,> located <VBN> at <IN> Chicago <NE>, is a good company.
    # IBM <NP>, <,> which <WDT> is/are located at Chicago

    # IBM, a Chicago based company, is good.
    # Chicago <NE> based <VBN> company <NN>, <,> IBM <NP> is good.
    # Located <VBN> in <IN> Chicago <NE>, <,> IBM <NP> is a good company.

    VBN_list = ['located','sited','placed','headquartered','positioned',
                'stationed','situated','replaced',
                'Located','Sited','Placed','Headquartered','Positioned',
                'Stationed','Situated','Replaced']

    # Cheap pre-filter: most sentences contain no trigger word, so bail out
    # before compiling the chunk grammar or running the POS tagger.
    if set(VBN_list).isdisjoint(sent):
        return []

    VBN_tagged = {(word, "VBN") for word in VBN_list}
    verb_tagged = {('is', 'VBZ'), ('are', 'VBP')}

    VBN_grammar = r"""
    NE: {<NNP|NNPS>+(<,><NNP|NNPS>)*(<IN><NNP|NNPS>)*}
    NEP: {<DT>?(<JJ>*<NN>*<IN>)?<NE>}
    NP1: {<JJ>*<CD>?<NN.*>+<POS>?<NN.*>*<CD>?}
    NP2: {<''><''><NN.*><POS><''>}
    VB: {<VBZ|VBP>}
    CLAUSE: {<DT>?<NP1|NP2><,>?<WDT>?<VB>?<VBN><IN><NEP>}
    {((<NEP><VBN><NN.*>)|(<VBN><IN><NEP>))<,>?<DT>?<NP1|NP2>}
    """

    cp = nltk.RegexpParser(VBN_grammar)
    tree = cp.parse(pos_tag(sent))
    for subtree in tree.subtrees():
        if subtree.label() != 'CLAUSE':
            continue
        leaves = subtree.leaves()
        # NOTE: a clause is kept only if it contains both a trigger word
        # tagged VBN and "is"/"are"; chunks the grammar matches without
        # is/are (e.g. "Located in Chicago, IBM ...") are filtered out here.
        if not VBN_tagged.isdisjoint(leaves) and not verb_tagged.isdisjoint(leaves):
            # Only the first qualifying clause of the sentence is reported.
            return [leaves]
    return []


def grammar_special_VBPZ(sent):
    """Extract a "<thing> lies/sits in <place>" clause from a sentence.

    Args:
        sent: One sentence as a list of word tokens.

    Returns:
        A list containing the ``(token, tag)`` leaves of the first matching
        ``CLAUSE`` chunk, or an empty list when the sentence has no trigger
        word or no chunk passes the filter.
    """
    # Fourth part: other special verbs (lie, sit) used as verb(VB & VBP & VBZ)
    special_VBPZ_list = ['lie','sit','lies','sits']

    # Cheap pre-filter: skip grammar compilation and POS tagging for the
    # (many) sentences that contain none of the trigger verbs.
    if set(special_VBPZ_list).isdisjoint(sent):
        return []

    special_VBPZ_tagged = {('lie','VBP'),('sit','VBP'),
                           ('lies','VBZ'),('sits','VBZ')}
    special_VBPZ_grammar = r"""
    NE: {<NNP|NNPS>+(<,><NNP|NNPS>)*(<IN><NNP|NNPS>)*}
    NEP: {<DT>?(<JJ>*<NN>*<IN>)?<NE>}
    NP1: {<JJ>*<CD>?<NN.*>+<POS>?<NN.*>*<CD>?}
    NP2: {<''><''><NN.*><POS><''>}
    VB: {<VBZ|VBP>}
    CLAUSE: {<DT>?<NP1|NP2>(<,>?<WDT>?)<VB><IN><NEP>}
    """
    cp = nltk.RegexpParser(special_VBPZ_grammar)
    tree = cp.parse(pos_tag(sent))
    for subtree in tree.subtrees():
        if subtree.label() != 'CLAUSE':
            continue
        leaves = subtree.leaves()
        # Keep the clause only if the trigger verb carries the expected tag.
        if not special_VBPZ_tagged.isdisjoint(leaves):
            # Only the first qualifying clause of the sentence is reported.
            return [leaves]
    return []
    

def grammar_special_VBG(sent):
    """Extract a "<thing> lying/sitting in <place>" clause from a sentence.

    Args:
        sent: One sentence as a list of word tokens.

    Returns:
        A list containing the ``(token, tag)`` leaves of the first matching
        ``CLAUSE`` chunk, or an empty list when the sentence has no trigger
        word or no chunk passes the filter.
    """
    # Fifth part: other special verbs (lie, sit) used as verb, gerund(VBG)
    special_VBG_list = ['lying','sitting','Lying','Sitting']

    # Cheap pre-filter: skip grammar compilation and POS tagging for the
    # (many) sentences that contain none of the trigger gerunds.
    if set(special_VBG_list).isdisjoint(sent):
        return []

    special_VBG_tagged = {(word, 'VBG') for word in special_VBG_list}
    special_VBG_grammar = r"""
    NE: {<NNP|NNPS>+(<,><NNP|NNPS>)*(<IN><NNP|NNPS>)*}
    NEP: {<DT>?(<JJ>*<NN>*<IN>)?<NE>}
    NP1: {<JJ>*<CD>?<NN.*>+<POS>?<NN.*>*<CD>?}
    NP2: {<''><''><NN.*><POS><''>}
    VB: {<VBZ|VBP>}
    CLAUSE: {<DT>?<NP1|NP2><,>?(<WDT><VB>)?<VBG><IN><NEP>}
    {<VBG><IN><NEP><,><DT>?<NP1|NP2>}
    """

    cp = nltk.RegexpParser(special_VBG_grammar)
    tree = cp.parse(pos_tag(sent))
    for subtree in tree.subtrees():
        if subtree.label() != 'CLAUSE':
            continue
        leaves = subtree.leaves()
        # Keep the clause only if the trigger word was tagged as a gerund.
        if not special_VBG_tagged.isdisjoint(leaves):
            # Only the first qualifying clause of the sentence is reported.
            return [leaves]
    return []


def location_extract(dataset):
    """Collect location clauses from every sentence of a dataset.

    Args:
        dataset: Iterable of sentences, each a list of word tokens.

    Returns:
        A flat list of the clauses found by ``grammar_VBN``,
        ``grammar_special_VBPZ`` and ``grammar_special_VBG``, in sentence
        order and, within a sentence, in that extractor order.
    """
    clauses = []
    for tokens in dataset:
        # All three extractors run on the sentence before any result is
        # merged; empty results contribute nothing.
        per_grammar = (
            grammar_VBN(tokens),
            grammar_special_VBPZ(tokens),
            grammar_special_VBG(tokens),
        )
        for found in per_grammar:
            clauses.extend(found)
    return clauses

In [7]:
%%time
# Run the three location grammars over every tokenized sentence and collect
# the matching clauses into one flat list.
sample_demo = location_extract(demo_data)

CPU times: user 3min 34s, sys: 2.55 s, total: 3min 37s
Wall time: 4min 2s


In [8]:
# Inspect the first extracted clause: a list of (token, POS tag) pairs.
sample_demo[0]

[('This', 'DT'),
 ('tunnel', 'NN'),
 ('is', 'VBZ'),
 ('located', 'VBN'),
 ('at', 'IN'),
 ('the', 'DT'),
 ('Great', 'NNP'),
 ('Dividing', 'NNP'),
 ('Range', 'NNP')]

In [11]:
# One row per extracted clause, stored in a single 'Sentence' column.
df = pd.DataFrame({'Sentence': sample_demo})

# The context manager saves and closes the workbook on exit; the explicit
# writer.save() call was removed in pandas 2.0.  sheet_name is passed by
# keyword because positional use is deprecated in recent pandas releases.
with ExcelWriter('Sample_demo.xlsx') as writer:
    df.to_excel(writer, sheet_name='Sheet1', index=False)