In [1]:
import os
import nltk
import re

import numpy as np

from collections import OrderedDict
from sklearn.feature_extraction.text import TfidfVectorizer
from IPython.display import display
from ipywidgets import IntProgress

In [2]:
documents = OrderedDict()

for name in os.listdir('/Users/andrewlamb/Google_Drive/Stanford/CS199/DB_SelfPaced/Flat'):
    with open('/Users/andrewlamb/Google_Drive/Stanford/CS199/DB_SelfPaced/Flat/' + name) as f:
        lines = []
        for line in f:
            lines.append(line.strip('| \n'))
        documents[name] = (' '.join(lines))
        
keywords = {}
stemmer = nltk.stem.PorterStemmer()

for name in os.listdir('/Users/andrewlamb/Google_Drive/Stanford/CS199/DB_SelfPaced/Keys'):
    with open('/Users/andrewlamb/Google_Drive/Stanford/CS199/DB_SelfPaced/Keys/' + name) as f:
        keys = []
        for line in f:
            key = line.strip()
            key = key.replace('-', ' ')
            key = ' '.join(stemmer.stem(token) for token in nltk.word_tokenize(key))
            key = key.lower()

            keys.append(key)
            
        keywords[name] = keys

In [3]:
tagged_documents = OrderedDict()
prog = IntProgress(min=0, max=len(documents))
display(prog)

os.environ['CLASSPATH'] = '/Users/andrewlamb/Downloads/stanford-postagger-2015-12-09'
os.environ['STANFORD_MODELS'] = '/Users/andrewlamb/Downloads/stanford-postagger-2015-12-09'
tagger = nltk.tag.StanfordPOSTagger('models/english-bidirectional-distsim.tagger')

for name, doc in documents.items():
    sents = [nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(doc)]
    tagged_documents[name] = tagger.tag_sents(sents)
    prog.value += 1

In [16]:
def extract_tags(tree):
    results = []
    
    if type(tree) == nltk.tree.Tree:
        if tree.label() == 'NP':
            phrase = ' '.join([word for word, _pos in tree])
            phrase = phrase.lower()
            phrase = re.sub("[().,']", '', phrase)
            phrase = re.sub('-', ' ', phrase)
            results.append(phrase)

        for child in tree:
            results.extend(extract_tags(child))
    
    return results

grammar = '''
NP: {(<JJ>|<JJS>|<JJR>)*(<NN>|<NNS>|<NNP>|<NNPS>)+} 
'''

cp = nltk.RegexpParser(grammar)
candidates = []

chunked_docs = []

for doc in tagged_documents.values():
    chunked_doc = []
    for sent in doc:
        result = cp.parse(sent)
        chunked_doc.extend(extract_tags(result))
        candidates.extend(extract_tags(result))
    chunked_docs.append(chunked_doc)
    
candidates = list(set(candidates))

In [17]:
for i, candidate in enumerate(candidates):
    candidates[i] = ' '.join([stemmer.stem(token) for token in nltk.word_tokenize(candidate)])
    
candidates = list(set(candidates))

In [19]:
stopwords = []
with open('/Users/andrewlamb/Google_Drive/Stanford/CS199/RAKE-tutorial/SmartStoplistAdditional.txt') as f:
    for line in f:
        stopwords.append(line.strip())
    
stemmed_documents = OrderedDict()

for name, document in documents.items():
    stemmed_document = document.replace('-', ' ')
    stemmed_document = ' '.join([stemmer.stem(token) for token in nltk.word_tokenize(document)])
    stemmed_documents[name] = stemmed_document.lower()

extractor = TfidfVectorizer(stop_words=stopwords, ngram_range=(1,4), vocabulary=candidates)
X = extractor.fit_transform(stemmed_documents.values())
inverse_voc = {v: k for k, v in extractor.vocabulary_.items()}

extracted = {}
for doc_idx, (name, keys) in enumerate(keywords.items()):
    extracted[name] = [inverse_voc[idx] for idx in np.argsort(X[doc_idx,:].toarray().flatten())]

In [20]:
matches = {}
jaccard = {}
partials = {}
partial_jaccard = {}


for name, keys in keywords.items():
    extracted_words = extracted[name][-len(keys):]
    matches[name.replace('.keys', '')] = list(set(keys).intersection(extracted_words))
    jaccard[name.replace('.keys', '')] = float(len(set(keys).intersection(extracted_words))) / len(set(keys).union(set(extracted_words)))
    
#     inner_partials = []
#     for manual_key in extracted[name]:
#         manual_tokens = nltk.word_tokenize(manual_key)
        
        
#         for key in keys:
#             tokens = nltk.word_tokenize(key)
            
#             if len(set(manual_tokens).intersection(set(tokens))) > 0:
#                 inner_partials.append((manual_key, key)) 
#                 break
    
#     partials[name.replace('.keys', '')] = inner_partials
#     partial_jaccard[name.replace('.keys', '')] = float(len(inner_partials)) / (len(keys) + len(extracted[name]) - len(inner_partials))
    
print('Average jaccard: {}'.format(sum(jaccard.values()) / len(jaccard)))
# print('Average partial jaccard: {}'.format(sum(partial_jaccard.values()) / len(partial_jaccard)))

Average jaccard: 0.0381818181818


In [21]:
matches = {}
jaccard = {}
partials = {}
partial_jaccard = {}


for name, keys in keywords.items():
    extracted_words = extracted[name][-2*len(keys):]
    matches[name.replace('.keys', '')] = list(set(keys).intersection(extracted_words))
    jaccard[name.replace('.keys', '')] = float(len(set(keys).intersection(extracted_words))) / len(set(keys).union(set(extracted_words)))
    
#     inner_partials = []
#     for manual_key in extracted[name]:
#         manual_tokens = nltk.word_tokenize(manual_key)
        
        
#         for key in keys:
#             tokens = nltk.word_tokenize(key)
            
#             if len(set(manual_tokens).intersection(set(tokens))) > 0:
#                 inner_partials.append((manual_key, key)) 
#                 break
    
#     partials[name.replace('.keys', '')] = inner_partials
#     partial_jaccard[name.replace('.keys', '')] = float(len(inner_partials)) / (len(keys) + len(extracted[name]) - len(inner_partials))
    
print('Average jaccard: {}'.format(sum(jaccard.values()) / len(jaccard)))
# print('Average partial jaccard: {}'.format(sum(partial_jaccard.values()) / len(partial_jaccard)))

Average jaccard: 0.0581663837012
