In [2]:
import sys
import numpy
import nltk
import nltk.data
import collections
import json

In [3]:
import string
from nltk.corpus import stopwords

In [4]:
# Setup
# nlp = StanfordCoreNLP('stanford-english-corenlp-2018-10-05-models')
sent_detector = nltk.data.load("tokenizers/punkt/english.pickle")

In [5]:
# Hardcoded word lists
yesnowords = ["can", "could", "would", "is", "does", "has", "was", "were", "had", "have", "did", "are", "will"]
commonwords = ["the", "a", "an", "is", "are", "were", "."]
questionwords = ["who", "what", "where", "when", "why", "how", "whose", "which", "whom"]

In [6]:
# Take in a tokenized question and return the question type and body
def processquestion(qwords):
    
    # Find "question word" (what, who, where, etc.)
    questionword = ""
    qidx = -1
    
#     qwords = preprocess(qwords)

    for (idx, word) in enumerate(qwords):
        if word.lower() in questionwords:
            questionword = word.lower()
            qidx = idx
            break
        elif word.lower() in yesnowords:
            return ("YESNO", qwords)

    if qidx < 0:
        return ("MISC", qwords)

    if qidx > len(qwords) - 3:
        target = qwords[:qidx]
    else:
        target = qwords[qidx+1:]
    type = "MISC"

    # Determine question type
    if questionword in ["who", "whose", "whom"]:
        type = "PERSON"
    elif questionword == "where":
        type = "PLACE"
    elif questionword == "when":
        type = "TIME"
    elif questionword == "how":
        if target[0] in ["few", "little", "much", "many"]:
            type = "QUANTITY"
            target = target[1:]
        elif target[0] in ["young", "old", "long"]:
            type = "TIME"
            target = target[1:]

    # Trim possible extra helper verb
    if questionword == "which":
        target = target[1:]
    if target[0] in yesnowords:
        target = target[1:]
    
    # Return question data
    return (type, target)

In [62]:
def preprocess(sent):
    sent = sent.translate(str.maketrans('', '', string.punctuation))
    
    stop_words = stopwords.words('english')
    sent = nltk.word_tokenize(sent)
    sent = [word for word in sent if word not in stop_words]
#    sent = nltk.pos_tag(sent)
    
    return sent  

In [121]:
q = "Who is Raden Ajeng Kartini?"
kocak = processquestion(preprocess(q)) # ["who", "Raden", "Ajeng", "Kartini"]
kocak

('PERSON', ['Raden', 'Ajeng', 'Kartini'])

In [80]:
questionPOS = nltk.pos_tag(preprocess(q))
questionPOS

[('Who', 'WP'), ('Raden', 'NNP'), ('Ajeng', 'NNP'), ('Kartini', 'NNP')]

In [120]:
# Process question
(type, target) = processquestion(preprocess(q))
target

['Raden', 'Ajeng', 'Kartini']

In [119]:
# Get sentence keywords
searchwords = set(target)
dict = collections.Counter()
searchwords

{'Ajeng', 'Kartini', 'Raden'}

In [122]:
with open('train.txt', 'r') as f:
    doc = f.read()        

In [123]:
type

'PERSON'

In [124]:
def word_count(words, doc):
    counts = collections.Counter()
#     words = words.split()
    sentence = nltk.sent_tokenize(doc)
    for (i, sent) in enumerate(sentence):
        sentwords = nltk.word_tokenize(sent)
        wordmatches = set(filter(set(searchwords).__contains__, sentwords))
        counts[sent] = len(wordmatches)
                
    return counts

In [128]:
relevant = word_count(target, doc).most_common(4)
relevant

[('Raden Ajeng Kartini was born on 21 April 1879 in Jepara.', 3),
 ('Ibu Kartini was very concerned because of education in Indonesia especially for women.',
  1),
 ("They were amazed with Kartini's fluency in Dutch.", 1),
 ('Kartini had a book to read from Mrs. Ovink and started to correspond with people in the Netherlands.',
  1)]

In [103]:
wordmatches =len(set(filter(set(searchwords).__contains__, nltk.word_tokenize(doc))))
wordmatches

3

In [88]:
# Find most relevant sentences
for (i, sent) in enumerate(doc):           
    sentwords = nltk.word_tokenize(sent)
    wordmatches = set(filter(set(searchwords).__contains__, sentwords))
    print(len(wordmatches))
#     print(len(wordmatches))
    dict[sent] = len(wordmatches)
sent

0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0


'.'

In [46]:
done = False

for (sentence, matches) in dict.most_common(10):
    tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
    #this line would tag Named entities
    tagged = nltk.ne_chunk(tagged)
    
    #basic chunking of NPs
    grammar = "NP: {<DT>?<JJ>*<NN>}"
    cp = nltk.RegexpParser(grammar)
    result = cp.parse(tagged)
    
#     parse = json.loads(corenlp.parse(sentence))
    sentencePOS = nltk.pos_tag(nltk.word_tokenize(sentence))

    # Attempt to find matching substrings
    searchstring = ' '.join(target)
    if searchstring in sentence:
        startidx = sentence.index(target[0])
        endidx = sentence.index(target[-1])
        answer = sentence[:startidx]
        done = True
    
    # Check if solution is found
    if done:
        continue

    # Check by question type
    answer = ""
    for worddata in result["sentences"][0]["words"]:
            
        # Mentioned in the question
        if worddata[0] in searchwords:
            continue
            
        if type == "PERSON":
            if worddata[1]["NamedEntityTag"] == "PERSON":
                answer = answer + " " + worddata[0]
                done = True
            elif done:
                break

        if type == "PLACE":
            if worddata[1]["NamedEntityTag"] == "LOCATION":
                answer = answer + " " + worddata[0]
                done = True
            elif done:
                break

        if type == "QUANTITY":
            if worddata[1]["NamedEntityTag"] == "NUMBER":
                answer = answer + " " + worddata[0]
                done = True
            elif done:
                break

        if type == "TIME":
            if worddata[1]["NamedEntityTag"] == "NUMBER":
                answer = answer + " " + worddata[0]
                done = True
            elif done:
                answer = answer + " " + worddata[0]
                break
            
if done:
    print (answer)

if not done:
    (answer, matches) = dict.most_common(1)[0]
    print (answer)

TypeError: Tree indices must be integers, not str

''