In [1]:
import os
import nltk
import re

import numpy as np

from collections import OrderedDict, defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from IPython.display import display
from ipywidgets import IntProgress

In [2]:
# Load every transcript in the flat directory as one whitespace-joined string,
# keyed by file name.
documents = OrderedDict()

flat_dir = '/Users/andrewlamb/Google_Drive/Stanford/CS199/DB_SelfPaced/Flat'

# os.listdir returns entries in arbitrary order; sort them so the OrderedDict
# (and every row index later derived from it) is the same on every run.
for name in sorted(os.listdir(flat_dir)):
    with open(os.path.join(flat_dir, name)) as f:
        # Trim pipes, spaces and newlines from both ends of each line, then
        # join the lines with single spaces.
        documents[name] = ' '.join(line.strip('| \n') for line in f)
        
# Gold keywords per lecture: lecture name -> set of stemmed, lowercased phrases.
keywords = defaultdict(set)
stemmer = nltk.stem.PorterStemmer()

# Each row is split on commas into exactly four fields: lecture name, key
# phrase, an ignored field, and an integer rank.
for name in os.listdir('/Users/andrewlamb/Google_Drive/Stanford/CS199/studyguides/data/keywords'):
    with open('/Users/andrewlamb/Google_Drive/Stanford/CS199/studyguides/data/keywords/' + name) as f:
        for line in f:
            lecture_name, phrase, _, rank_text = line.strip().split(',')
            if int(rank_text) <= -1:
                # Rows ranked -1 or lower are not used as keywords.
                continue
            # Normalise the phrase: hyphens become spaces, every token is
            # stemmed, and the re-joined phrase is lowercased.
            tokens = nltk.word_tokenize(phrase.replace('-', ' '))
            stemmed_phrase = ' '.join(stemmer.stem(token) for token in tokens)
            keywords[lecture_name].add(stemmed_phrase.lower())

In [3]:
# POS-tag every document sentence by sentence; the progress bar advances once
# per document.
tagged_documents = OrderedDict()
prog = IntProgress(min=0, max=len(documents))
display(prog)

# CLASSPATH and STANFORD_MODELS both name the local Stanford POS tagger directory.
os.environ['CLASSPATH'] = os.environ['STANFORD_MODELS'] = '/Users/andrewlamb/Downloads/stanford-postagger-full-2015-12-09'
tagger = nltk.tag.StanfordPOSTagger('models/english-bidirectional-distsim.tagger')

for name, doc in documents.items():
    # Split into sentences first, then into word tokens per sentence.
    sents = []
    for sent in nltk.sent_tokenize(doc):
        sents.append(nltk.word_tokenize(sent))
    tagged_documents[name] = tagger.tag_sents(sents)
    prog.value += 1

In [29]:
def extract_tags(tree):
    """Collect normalised noun-phrase strings from a chunk tree.

    Parameters
    ----------
    tree : nltk.tree.Tree or any other object
        A chunk tree, one of its subtrees, or a leaf. Anything that is not a
        Tree (or a Tree subclass) contributes no phrases.

    Returns
    -------
    list of str
        One string per subtree labelled 'NP', in pre-order. Each phrase is
        the space-joined words of the subtree, lowercased, with the
        characters ``( ) . , '`` removed, hyphens turned into spaces and runs
        of whitespace collapsed to a single space.
    """
    # isinstance (rather than an exact type comparison) also accepts Tree
    # subclasses.
    if not isinstance(tree, nltk.tree.Tree):
        return []

    results = []

    if tree.label() == 'NP':
        # Children of an NP chunk are expected to be (word, tag) pairs.
        phrase = ' '.join(word for word, _pos in tree).lower()
        phrase = re.sub("[().,']", '', phrase)
        phrase = phrase.replace('-', ' ')
        # Removing punctuation can leave doubled or leading/trailing spaces;
        # normalise so the phrase is a clean single-space-joined n-gram.
        phrase = ' '.join(phrase.split())
        results.append(phrase)

    for child in tree:
        results.extend(extract_tags(child))

    return results

# Chunk grammar: an NP is zero or more adjectives (JJ/JJS/JJR) followed by one
# or more nouns (NN/NNS/NNP/NNPS).
grammar = '''
NP: {(<JJ>|<JJS>|<JJR>)*(<NN>|<NNS>|<NNP>|<NNPS>)+} 
'''

cp = nltk.RegexpParser(grammar)
candidates = []

chunked_docs = []

for doc in tagged_documents.values():
    chunked_doc = []
    for sent in doc:
        result = cp.parse(sent)
        # Walk the chunk tree once and reuse the phrases for both the
        # per-document list and the global candidate list.
        phrases = extract_tags(result)
        chunked_doc.extend(phrases)
        candidates.extend(phrases)
    chunked_docs.append(chunked_doc)
    
# Read the stop list: one entry per line, surrounding whitespace removed.
stopwords = []
with open('/Users/andrewlamb/Google_Drive/Stanford/CS199/RAKE-tutorial/SmartStoplistAdditional.txt') as f:
    stopwords.extend(line.strip() for line in f)
        
def filter_stopwords(candidate):
    """Return True when ``candidate`` should be kept as a keyword candidate.

    A candidate is kept only if neither its first nor its last token (as
    produced by ``nltk.word_tokenize``) appears in the module-level
    ``stopwords`` collection. A candidate that tokenizes to nothing is
    rejected.
    """
    tokens = nltk.word_tokenize(candidate)
    if not tokens:
        # Empty or whitespace-only phrase: reject it instead of raising
        # IndexError on tokens[0].
        return False
    return tokens[0] not in stopwords and tokens[-1] not in stopwords
    
# De-duplicate the candidates and sort them so the vocabulary order is
# reproducible, then keep phrases longer than two characters that neither
# start nor end with a stop word. Real lists are built here (not filter()
# objects, which are one-shot iterators on Python 3) because later cells call
# len() on the result and iterate over it several times.
candidates = sorted(set(candidates))
candidates = [
    candidate for candidate in candidates
    if len(candidate) > 2 and filter_stopwords(candidate)
]

In [36]:
# Quick look at the candidate vocabulary: its size, 20 distinct random
# samples (unseeded, so they differ between runs), and the full list ordered
# from shortest to longest phrase.
print(len(candidates))
import random
print(np.random.choice(candidates, size=20, replace=False))
print(sorted(candidates, key=len))

4819
[u'huge industry' u'matching values' u'levels' u'priority field'
 u'hash tables' u'query tomorrow' u'crash recovery' u'months'
 u'data manipulation language' u'table t 1' u'quick summary'
 u'access information' u'document model' u'fixes' u'today' u'objects kind'
 u'exact term' u'alternate notations' u'approximate solutions' u'mixture']


In [None]:
# Persist the candidate vocabulary to vocab.txt, one phrase per line.
with open('vocab.txt', 'w') as f:
    for candidate in candidates:
        f.write(candidate + '\n')

In [None]:
# Keep only the candidates that contain at least one space-separated word
# longer than six characters.
trimmed_candidates = [
    candidate
    for candidate in candidates
    if any(len(token) > 6 for token in candidate.split(' '))
]

In [None]:
# Reload the stop list (the same file the candidate filter reads): one entry
# per line, surrounding whitespace removed.
stopwords = []
with open('/Users/andrewlamb/Google_Drive/Stanford/CS199/RAKE-tutorial/SmartStoplistAdditional.txt') as f:
    stopwords.extend(line.strip() for line in f)
    
# Stemmed, lowercased copy of every document, keyed like `documents`.
stemmed_documents = OrderedDict()

for name, document in documents.items():
    # Replace hyphens with spaces BEFORE tokenizing, matching how the gold
    # keywords are normalised. Tokenize the de-hyphenated text, not the raw
    # document, so the replacement is not lost.
    stemmed_document = document.replace('-', ' ')
    stemmed_document = ' '.join([stemmer.stem(token) for token in nltk.word_tokenize(stemmed_document)])
    stemmed_documents[name] = stemmed_document.lower()

# Score every candidate phrase (1- to 4-grams) in every stemmed document with
# un-normalised, un-smoothed TF-IDF over the fixed candidate vocabulary.
# NOTE(review): `candidates` holds unstemmed phrases while the documents (and
# the gold keywords) are stemmed — confirm this mismatch is intended.
extractor = TfidfVectorizer(
    ngram_range=(1,4),
    vocabulary=candidates,
    norm=None,
    smooth_idf=False,
    stop_words=stopwords,
)
X = extractor.fit_transform(stemmed_documents.values())

# Column index -> phrase, the reverse of the fitted vocabulary mapping.
inverse_voc = {index: term for term, index in extractor.vocabulary_.items()}

# For each document, every vocabulary phrase ordered by ascending score, so
# the best-scoring phrases sit at the END of each list.
extracted = {}
for doc_idx, name in enumerate(documents.keys()):
    scores = X[doc_idx,:].toarray().flatten()
    extracted[name] = [inverse_voc[idx] for idx in np.argsort(scores)]

In [None]:
# Compare, per lecture, the gold keywords with the N best-scoring extracted
# phrases, where N is the number of gold keywords for that lecture.
matches = OrderedDict()
jaccard = OrderedDict()

for name, keys in sorted(keywords.items()):
    label = name.replace('.keys', '')
    gold = set(keys)
    # `extracted` is sorted by ascending score, so the tail holds the best phrases.
    extracted_words = extracted[name][-len(keys):]
    overlap = gold.intersection(extracted_words)
    matches[label] = list(overlap)
    jaccard[label] = float(len(overlap)) / len(gold.union(extracted_words))

print('Average jaccard: {}'.format(sum(jaccard.values()) / len(jaccard)))

In [None]:
# Same per-lecture comparison, but against the 2N best-scoring extracted
# phrases, where N is the number of gold keywords for that lecture.
matches = OrderedDict()
jaccard = OrderedDict()

for name, keys in sorted(keywords.items()):
    label = name.replace('.keys', '')
    gold = set(keys)
    # `extracted` is sorted by ascending score, so the tail holds the best phrases.
    extracted_words = extracted[name][-2*len(keys):]
    overlap = gold.intersection(extracted_words)
    matches[label] = list(overlap)
    jaccard[label] = float(len(overlap)) / len(gold.union(extracted_words))

print('Average jaccard: {}'.format(sum(jaccard.values()) / len(jaccard)))