Roger Mei

In [317]:
# import libraries
import os
import nltk
from nltk.tokenize import word_tokenize,sent_tokenize,RegexpTokenizer
from nltk.corpus import stopwords
import pandas as pd
import numpy as np
import spacy
import gensim
from gensim import corpora
from gensim.summarization.bm25 import *
from nltk.corpus import wordnet as wn
from rake_nltk import Rake
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# English stop words from NLTK, stored as a set so that the membership tests
# used when filtering tokens do not scan a list.
stop_words = set(stopwords.words('english'))

# Reading in the corpus:

In [12]:
# Corpus locations, relative to the working directory.
path_13 = './2013'
path_14 = './2014'
path_train = './training/'

# Raw document texts, one string per file, grouped by year.
text_2013 = []
text_2014 = []

# Read the 2013 texts. Files are opened in binary mode and decoded by hand so
# that invalid UTF-8 bytes are replaced instead of raising. The `with` block
# closes each file handle as soon as it has been read.
for file in os.listdir(path_13):
    with open(os.path.join(path_13, file), 'rb') as handle:
        text_2013.append(handle.read().decode("utf-8", errors="replace"))

# Read the 2014 texts the same way.
for file in os.listdir(path_14):
    with open(os.path.join(path_14, file), 'rb') as handle:
        text_2014.append(handle.read().decode("utf-8", errors="replace"))

# Combined corpus: 2013 documents first, then 2014. Positions in this list
# are the document indices.
all_text = text_2013 + text_2014

# word tokenizing all documents in corpus:
def tokenize_texts(all_text):
    '''
    Word-tokenize every document of the corpus.

    input:
        all_text - list of str: raw text of each document
    return:
        list of list of str: word tokens of each document, in the same
        order as all_text
    '''
    return [word_tokenize(document) for document in all_text]
        
# Tokenize the whole corpus once and print the number of documents as a
# quick sanity check.
tokenized_texts = tokenize_texts(all_text)
print(len(tokenized_texts))

730


# Creating document retrieval hashmap: 

In [241]:
# creating document retrieval hashmap
def create_hashmap(all_text):
    '''
    Build the document retrieval index: for every non-stop word, the indices
    of the documents that contain it.

    Documents are lower-cased before tokenizing, so every key is lower case.

    input:
        all_text - list of str: list of all documents
    return:
        hashmap - dict: dictionary of word, document index pairs that map a word to documents containing that word
    '''
    hashmap = {}
    for i, text in enumerate(all_text):
        # set() keeps one entry per distinct word, so a document index is
        # recorded at most once for each word
        words_in_doc = set(word_tokenize(text.lower()))
        for word in words_in_doc:
            if word not in stop_words:
                hashmap.setdefault(word, []).append(i)
    return hashmap

# Build the index over the full corpus, then print its first three
# (word, document indices) entries as a preview.
hashmap = create_hashmap(all_text)
iterator = iter(hashmap.items())
for i in range(3):
    print(next(iterator))

('6.5', [0, 1, 2, 3, 8, 16, 17, 21, 22, 23, 24, 25, 26, 28, 29, 37, 38, 39, 40, 41, 43, 45, 47, 48, 50, 54, 57, 58, 60, 64, 65, 66, 68, 71, 72, 76, 82, 83, 86, 87, 88, 94, 95, 98, 107, 108, 109, 111, 116, 118, 119, 126, 129, 131, 132, 134, 136, 137, 144, 145, 153, 155, 156, 158, 159, 160, 161, 162, 163, 168, 170, 172, 174, 175, 176, 177, 180, 181, 183, 197, 198, 200, 201, 203, 208, 209, 210, 216, 218, 220, 221, 222, 224, 226, 227, 228, 233, 235, 238, 239, 243, 244, 248, 255, 256, 261, 264, 266, 267, 270, 273, 281, 285, 291, 292, 294, 297, 299, 303, 304, 305, 306, 307, 309, 312, 314, 318, 327, 328, 333, 334, 335, 337, 338, 340, 341, 343, 344, 353, 355, 359, 363, 371, 376, 379, 381, 383, 384, 385, 386, 390, 393, 400, 404, 408, 415, 416, 418, 423, 431, 433, 437, 438, 440, 443, 451, 453, 458, 459, 463, 466, 468, 469, 470, 471, 473, 481, 483, 485, 489, 494, 495, 496, 498, 502, 504, 505, 509, 511, 512, 513, 517, 518, 523, 528, 531, 532, 536, 538, 541, 542, 544, 545, 548, 551, 557, 560, 562, 

# Question type classification:

•Questions starting with “Who” or “Whom” are of type **Person**.  
•Questions starting with “Where” are of type **Location**.  
•Questions starting with “When”, “What time” or “What date” are of type **Date/Time**.  
•Questions starting with “How few”, “How great”, “How little”, “How many” or “How much” are of type **Quantity** (quantities, money amounts and percentages).  
•Questions starting with "Which company/companies" are of type **Organization**.  
•Questions starting with "What percent/percentage" (or "What %") are of type **Percent**.  
•Questions starting with "What" without any other information are of type **Noun**.

After document retrieval, only sentences that include the specified named entities will be considered as answers.

In [437]:
# question type classification
def question_class(q):
    '''
    Classify a question by the kind of answer it expects.

    Only the first one or two tokens of the lower-cased, word-tokenized
    question are inspected.

    input:
        q - str: question to be asked
    return:
        classification - list of str: classifications of types of answers expected as a spacy named entity;
                         ['NN'] when the answer is a plain noun, [] when no rule matches
                         (including an empty question)
    '''
    # Guard before tokenizing: an empty or whitespace-only question has no
    # tokens to inspect.
    if not q or not q.strip():
        return []

    tokens = word_tokenize(q.lower())
    if not tokens:
        return []

    first = tokens[0]
    # A one-word question has no second token; '' matches none of the lists below.
    second = tokens[1] if len(tokens) > 1 else ''

    if first in ('who', 'whom'):
        return ['PERSON']
    if first == 'where':
        return ['LOC', 'GPE']
    if first == 'when' or (first == 'what' and second in ('date', 'time')):
        return ['TIME', 'DATE']
    if first == 'how' and second in ('few', 'great', 'little', 'many', 'much'):
        return ['QUANTITY', 'MONEY', 'PERCENT']
    if first == 'which' and second in ('company', 'companies'):
        return ['ORG']
    if first == 'what' and second in ('%', 'percent', 'percents', 'percentage', 'percentages'):
        return ['PERCENT']
    # answers are nouns
    if first == 'what':
        return ['NN']
    return []

Examples:

In [378]:
# "How many ..." is classified as a quantity question.
print(question_class("How many points did the S&P decrease by?"))
# "What percentage ..." is matched by the percent rule before the generic "what" rule.
print(question_class("What percentage of drop or increase is associated with this property?"))

['QUANTITY', 'MONEY', 'PERCENT']
['PERCENT']


# Extracting keywords from question: 

<u>selected keywords</u>  
•All non-stop words of a quoted expression    
•All named entities, recognized as proper nouns    
•All nouns and their adjectival modifiers    
•For example: “What is the name of the "female" counterpart to El Nino, which results in cooling temperatures and very dry weather?”      
 --Keywords : name/ female/ counterpart/ El Nino / cooling/ temperatures/ dry/ weather/

<u>Special cases for business type questions:</u>  
•“organization” is dropped and replaced as most organizations mentioned in the documents may contain “Co.”, “Ltd.” but not the word “organization” itself  
•“people” should be kept and expanded to include “residents” or “populations” or “citizens”  

In [475]:
def keyword_extract(q):
    '''
    Pick the keywords of a question: the words of the RAKE key phrases,
    minus the verbs (gerunds, tagged VBG, are kept).

    input:
        q - str: question to be asked
    return:
        list of str: lower-cased keywords, in the order they occur in the question
    '''
    # RAKE returns ranked phrases; break every phrase into its single words
    rake = Rake()
    rake.extract_keywords_from_text(q)
    candidate_words = set()
    for phrase in set(rake.get_ranked_phrases()):
        candidate_words.update(phrase.split() if " " in phrase else [phrase])

    # walk the question in order, dropping verbs and any token RAKE did not select
    return [
        token.lower()
        for token, pos in nltk.pos_tag(word_tokenize(q))
        if ('VB' not in pos or pos == 'VBG') and token.lower() in candidate_words
    ]

Examples:

In [469]:
# Show the part-of-speech tags of the sample question.
print("pos tags:")
print(nltk.pos_tag(word_tokenize("What is the name of the \"female\" counterpart to El Nino, which results in cooling temperatures and very dry weather?")))
# pos_tag incorrectly tags "results" as a plural noun, so the verb filter in
# keyword_extract does not remove it

# Keywords extracted from the same question.
print("\nkeywords:")
extracted = keyword_extract("What is the name of the \"female\" counterpart to El Nino, which results in cooling temperatures and very dry weather?")
print(extracted)

# Second example; as the last expression of the cell, its value is displayed.
keyword_extract("What companies went bankrupt in February 2012?")

pos tags:
[('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('name', 'NN'), ('of', 'IN'), ('the', 'DT'), ('``', '``'), ('female', 'JJ'), ("''", "''"), ('counterpart', 'NN'), ('to', 'TO'), ('El', 'NNP'), ('Nino', 'NNP'), (',', ','), ('which', 'WDT'), ('results', 'NNS'), ('in', 'IN'), ('cooling', 'VBG'), ('temperatures', 'NNS'), ('and', 'CC'), ('very', 'RB'), ('dry', 'JJ'), ('weather', 'NN'), ('?', '.')]

keywords:
['name', 'female', 'counterpart', 'El', 'Nino', 'results', 'cooling', 'temperatures', 'dry', 'weather']


['companies', 'bankrupt', 'February', '2012']

# Document selection:

Filter the corpus down to the documents that contain at least one of the extracted keywords:

In [189]:
def doc_select(keywords,hashmap):
    '''
    Select every document that contains at least one of the keywords.

    Keywords that are not in the hashmap are ignored.

    input:
        keywords - list of str: keywords found in question
        hashmap - dict: dictionary of word, document index pairs that map a word to documents containing that word
    return:
        docs - list of int: indices of documents to be used for answer,
               without duplicates and in ascending order
    '''
    docs = set()
    for k in keywords:
        # .get with an empty default skips unknown keywords without a
        # catch-all except that would also hide unrelated errors
        docs.update(hashmap.get(k, ()))
    # sorted so the result does not depend on set iteration order
    return sorted(docs)

Example:

In [207]:
# Extract the keywords of a sample question, then list the indices of all
# documents that contain at least one of them.
keywords = keyword_extract("Who is the CEO of Tesla?")
print(keywords)
print(doc_select(keywords,hashmap))

['tesla', 'ceo']
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 32, 33, 35, 36, 37, 38, 39, 40, 41, 43, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 57, 58, 59, 60, 61, 63, 64, 65, 66, 67, 68, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 97, 98, 99, 100, 101, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 129, 130, 131, 132, 133, 134, 136, 137, 138, 140, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 173, 174, 175, 176, 178, 179, 180, 181, 182, 183, 184, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 238, 239, 240, 

# Okapi BM25:

Among the selected documents, keep the 5 with the highest Okapi scores, computed with the BM25 algorithm.

<img src="okapi.png" style="height:200px">

In [508]:
def okapi_scoring(q,tokenized_texts,hashmap,top_n=5):
    '''
    Rank the documents that share a keyword with the question by their BM25
    score and keep the best ones.

    input:
        q - str: question to score each document against
        tokenized_texts - list of list of str: list of word tokenized documents to be scored
        hashmap - dict: dictionary of word, document index pairs that map a word to documents containing that word
        top_n - int: maximum number of documents to return (default 5)
    return:
        best_docs - list of int: list of indices of the top_n most relevant document to the question based on okapi scores,
                    ordered from lowest to highest score (best document last);
                    empty when no document contains any keyword
    '''
    # setup
    keywords = keyword_extract(q)
    docs = doc_select(keywords,hashmap)
    # Nothing to rank: avoid building BM25 over an empty corpus. The top_n
    # check also matters because a slice of [-0:] would return everything.
    if not docs or top_n <= 0:
        return []
    filtered_texts = [tokenized_texts[i] for i in docs]

    # scoring
    bm25 = BM25(filtered_texts)
    query = word_tokenize(q)
    scores = bm25.get_scores(query)

    # argsort is ascending, so the last top_n positions hold the highest
    # scores; map them back from positions in `docs` to corpus indices
    return [docs[idx] for idx in np.argsort(scores)[-top_n:]]

Example:

In [495]:
# Indices of the best-scoring documents for a sample question.
okapi_scoring("Who is the CEO of company Tesla?",tokenized_texts,hashmap)

[384, 675, 107, 697, 4]

# Sentence selection:

Among the 5 best documents, choose the sentence most relevant to the question, based on cosine similarity
(both the answers and the chosen sentence are returned, the sentence for debugging purposes):

In [441]:
def sentence_select(q,docs):
    '''
    Find the sentence in docs that is most similar to the question and pull
    the answers out of it.

    Similarity is the cosine similarity of character-level TF-IDF vectors of
    the question and the sentence. When the question expects a named entity,
    only sentences containing an entity of an expected type are candidates.

    input:
        q - str: question to score each sentence against
        docs - list of str: list of documents to select sentence from
    return:
        answers - list of str: most relevant answers to query: the matching
                  entities, the nouns of the sentence for noun questions, or
                  the sentence itself when the question type is unknown
        best_sent - str: the selected sentence ("" when nothing was selected)
    '''
    best_sent = ""
    best_cs = 0

    q_type = question_class(q)
    # True when the answer must be a spaCy entity with one of the q_type labels
    wants_entities = bool(q_type) and q_type[0] != 'NN'

    # Load the spaCy pipeline once, and only when entities are needed,
    # instead of once per document.
    nlp = spacy.load('en') if wants_entities else None
    tfidf_vectorizer = TfidfVectorizer(analyzer="char")

    for doc in docs:
        # sentence tokenize document
        sentences = sent_tokenize(doc)

        # keep only sentences with wanted entities (each sentence at most once)
        if wants_entities:
            filtered_sentences = [
                sent for sent in sentences
                if any(ent.label_ in q_type for ent in nlp(sent).ents)
            ]
        else:
            filtered_sentences = sentences

        # compute cosine similarity of question with each sentence
        for sent in filtered_sentences:
            tfidf_matrix = tfidf_vectorizer.fit_transform((q,sent))
            cs = cosine_similarity(tfidf_matrix[0:1],tfidf_matrix)
            if cs[0][1] > best_cs:
                best_cs = cs[0][1]
                best_sent = sent

    # no candidate sentence (e.g. docs is empty): nothing to extract
    if not best_sent:
        return [], best_sent

    # extract answers from best sentence
    if wants_entities:
        answers = [ent.text for ent in nlp(best_sent).ents if ent.label_ in q_type]
    elif q_type:
        # noun question (q_type is ['NN']): every token with a noun tag
        answers = [token for token, pos in nltk.pos_tag(word_tokenize(best_sent)) if 'NN' in pos]
    else:
        # unclassified question: fall back to the whole sentence
        answers = [best_sent]

    # sentence is also returned for debugging purposes
    return answers, best_sent

Examples:

In [493]:
# Ask who the CEO of each company is; print the answers and the sentence
# they were taken from.
for company in ("Tesla", "Apple", "Berkshire Hathaway"):
    q = "Who is the CEO of company {}?".format(company)
    docs = [all_text[idx] for idx in okapi_scoring(q, tokenized_texts, hashmap)]
    print(company, ":")
    answer, sent = sentence_select(q, docs)
    print(answer)
    print(sent)
    print()

Tesla :
['Elon Musk', 'Tesla']
Elon Musk is CEO of Tesla and chairman of Solar City, so he's been shedding net worth.�How much net worth?

Apple :
['Tim Cook', 'Einhorn']
Apple CEO Tim Cook has said he would take a look at Einhorn's proposal.

Berkshire Hathaway :
['Buffett']
Sharp drops in many of the stocks owned by Buffett's Berkshire Hathaway in recent weeks hit the sprawling conglomerate's equity portfolio hard.



In [490]:
# Ask which companies went bankrupt in each period; print the answers and
# the sentence they were taken from.
for date in ("August of 2013", "January of 2014", "May of 2014"):
    q = "Which companies went bankrupt in {}?".format(date)
    docs = [all_text[idx] for idx in okapi_scoring(q, tokenized_texts, hashmap)]
    print(date, ":")
    answer, sent = sentence_select(q, docs)
    print(answer)
    print(sent)
    print()

August of 2013 :
(['Icahn', 'Tropicana Casino', 'Resort'], "Icahn also owns Atlantic City's Tropicana Casino and Resort, which he bought out of bankruptcy in 2010.")

January of 2014 :
(['McPhail', 'Superconductor', 'Superconductor', 'AMSC'], 'McPhail tipped the various defendants on other occasions, funneling them inside information about American Superconductor�s quarterly earnings announcements in July and September 2009, and again in January 2010.� He also alerted them in the fall of 2009 to a contract worth $100 million, and in November 2010 to a likely drop in American Superconductor�s share�s price, which occurred a few days later when AMSC announced a secondary stock offering.')

May of 2014 :
(['Ackman', 'Herbalife', 'FTC', 'Ackman'], 'Ackman said his bet against Herbalife is notionally larger than it was when he first initiated his short on the company, and if the company went out of business today, Ackman would make about $2 billion.� In March, the�FTC launched a formal inve

In [509]:
# Noun-type question: retrieve the best documents, then select the sentence.
docs = [
    all_text[idx]
    for idx in okapi_scoring("What affects GDP?", tokenized_texts, hashmap)
]

sentence_select("What affects GDP?", docs)

(['end', 'Pascal', 'Wager'],
 "What we forget at the end of all this is Pascal's Wager.")

In [511]:
# Percent-type questions about GDP for several economic properties; print
# the answers and the sentence they were taken from.
for prop in ("unemployment", "interest rates", "poverty"):
    q = "What percentage of drop or increase in GDP is associated with {}?".format(prop)
    docs = [all_text[idx] for idx in okapi_scoring(q, tokenized_texts, hashmap)]
    print(prop, ":")
    answer, sent = sentence_select(q, docs)
    print(answer)
    print(sent)
    print()

unemployment :
['3 percent']
Economists say a growth pace in excess of 3 percent would be needed over a sustained period to significantly lower high unemployment.

interest rates :
['37 percent', '24 percent']
Looking ahead, 37 percent of respondents anticipate further increases in raw materials prices over the next six months, while 24 percent expect higher finished goods prices.

poverty :
['0.8 percent']
Weakness in German industrial output and both domestic and foreign orders have pointed to a poor April-June period after 0.8 percent expansion in the first three months of the year, the fastest rate in three years.

