In [12]:
import os
import nltk
import re

import numpy as np

from collections import OrderedDict
from gensim import corpora, models
from IPython.display import display
from ipywidgets import IntProgress

In [2]:
documents = OrderedDict()

for name in os.listdir('/Users/andrewlamb/Google_Drive/Stanford/CS199/DB_SelfPaced/Flat'):
    with open('/Users/andrewlamb/Google_Drive/Stanford/CS199/DB_SelfPaced/Flat/' + name) as f:
        lines = []
        for line in f:
            lines.append(line.strip('| \n'))
        documents[name] = (' '.join(lines))
        
keywords = {}
stemmer = nltk.stem.PorterStemmer()

for name in os.listdir('/Users/andrewlamb/Google_Drive/Stanford/CS199/DB_SelfPaced/Keys'):
    with open('/Users/andrewlamb/Google_Drive/Stanford/CS199/DB_SelfPaced/Keys/' + name) as f:
        keys = []
        for line in f:
            key = line.strip()
            key = key.replace('-', ' ')
            key = ' '.join(stemmer.stem(token) for token in nltk.word_tokenize(key))
            key = key.lower()

            keys.append(key)
            
        keywords[name] = keys

In [10]:
tagged_documents = OrderedDict()
prog = IntProgress(min=0, max=len(documents))
display(prog)

os.environ['CLASSPATH'] = '/Users/andrewlamb/Downloads/stanford-postagger-2015-12-09'
os.environ['STANFORD_MODELS'] = '/Users/andrewlamb/Downloads/stanford-postagger-2015-12-09'
tagger = nltk.tag.StanfordPOSTagger('models/english-bidirectional-distsim.tagger')

for name, doc in documents.items():
    sents = [nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(doc)]
    tagged_documents[name] = tagger.tag_sents(sents)
    prog.value += 1

In [81]:
def extract_tags(tree):
    results = []
    
    if type(tree) == nltk.tree.Tree:
        if tree.label() == 'NP':
            phrase = ' '.join([word for word, _pos in tree])
            phrase = phrase.lower()
            results.append(phrase)

        for child in tree:
            results.extend(extract_tags(child))
    
    return results

grammar = '''
NP: {(<JJ>|<JJS>|<JJR>)*(<NN>|<NNS>|<NNP>|<NNPS>)+} 
'''

cp = nltk.RegexpParser(grammar)
candidates = set()

chunked_docs = OrderedDict()

for name, doc in tagged_documents.items():
    chunked_doc = []
    for sent in doc:
        result = cp.parse(sent)
        chunked_doc.extend(extract_tags(result))
    chunked_docs[name] = chunked_doc
    
# Remove stopwords
stopwords = set()
with open('/Users/andrewlamb/Google_Drive/Stanford/CS199/RAKE-tutorial/SmartStoplistAdditional.txt') as f:
    for line in f:
        stopwords.add(line.strip())
        
stopped_docs = OrderedDict()
for name, doc in chunked_docs.items():
    stopped_docs[name] = [token for token in doc if len(token) > 2 and token not in stopwords]

stemmer = nltk.stem.PorterStemmer()
stemmed_docs = OrderedDict()
for name, doc in stopped_docs.items():
    stemmed_doc = []
    for chunk in doc:
        stemmed_doc.append(' '.join([stemmer.stem(word) for word in nltk.word_tokenize(chunk)]))

    stemmed_docs[name] = stemmed_doc

    
dictionary = corpora.Dictionary(stemmed_docs.values())
corpus = [dictionary.doc2bow(text) for text in stemmed_docs.values()]

ldamodel = models.ldamodel.LdaModel(corpus, num_topics=20, id2word=dictionary, passes=10)

In [107]:
extracted = {}

for name, bow in zip(documents.keys(), corpus):
    extracted[name.replace('txt', 'keys')] = [dictionary[ident] for ident, prob in ldamodel[bow]]

In [112]:
matches = {}
jaccard = {}
partials = {}
partial_jaccard = {}


for name, keys in keywords.items():
    matches[name.replace('.keys', '')] = list(set(keys).intersection(set(extracted[name])))
    jaccard[name.replace('.keys', '')] = float(len(set(keys).intersection(set(extracted[name])))) / len(set(keys).union(set(extracted[name])))
    
    print(extracted[name])
    print(keys)
    
    inner_partials = []
    for manual_key in extracted[name]:
        manual_tokens = nltk.word_tokenize(manual_key)
        
        
        for key in keys:
            tokens = nltk.word_tokenize(key)
            
            if len(set(manual_tokens).intersection(set(tokens))) > 0:
                inner_partials.append((manual_key, key)) 
                break
    
    partials[name.replace('.keys', '')] = inner_partials
    partial_jaccard[name.replace('.keys', '')] = float(len(inner_partials)) / (len(keys) + len(extracted[name]) - len(inner_partials))
    
print('Average jaccard: {}'.format(sum(jaccard.values()) / len(jaccard)))
print('Average partial jaccard: {}'.format(sum(partial_jaccard.values()) / len(partial_jaccard)))

[u'multipl user', u'aspect', u'disk', u'gener form', u'data base system']
[u'databas manag system', u'physic data independ', u'high level queri languag', u'data model', u'relat data model', u'xml', u'schema', u'data definit languag', u'data manipul languag', u'databas design', u'databas administr']
[u'databas implement', u'data base system']
[u'json', u'semi structur data', u'relat data model', u'object', u'label valu pair', u'schema', u'self describ data']
[u'edg', u'aspect', u'data base system']
[u'queri', u'queri languag', u'ad hoc queri', u'sql', u'data manipul languag', u'closur of the languag']
[u'data base system']
[u'json', u'array', u'object', u'label valu pair', u'schema', u'composit object', u'syntact correct', u'constraint', u'json schema', u'valid', u'properti', u'semant correct', u'enumer constraint']
[u'same data']
[u'relat model', u'databas manag system', u'tabl', u'attribut', u'tupl', u'row', u'type', u'domain', u'schema', u'null', u'key']
Average jaccard: 0.0
Average 