In [1]:
import os
import nltk
import re

import numpy as np

from collections import OrderedDict, defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from IPython.display import display
from ipywidgets import IntProgress

In [2]:
# Caption transcripts, keyed by file name. Each file is flattened into a single
# string; leading/trailing pipes, spaces and newlines are trimmed from each line.
documents = OrderedDict()

for caption_file in os.listdir('../../../data/Introduction_to_Databases_Captions'):
    with open('../../../data/Introduction_to_Databases_Captions/' + caption_file) as handle:
        cleaned = [raw.strip('| \n') for raw in handle]
    documents[caption_file] = ' '.join(cleaned)

# Gold-standard keywords per lecture, stored as lower-cased, stemmed phrases.
keywords = defaultdict(set)
stemmer = nltk.stem.PorterStemmer()

for keyword_file in os.listdir('../../../data/keywords/'):
    with open('../../../data/keywords/' + keyword_file) as handle:
        for row in handle:
            # Each row must have exactly four comma-separated fields; the third
            # one is ignored.
            lecture_name, phrase, _, rank = row.strip().split(',')
            # The rank is not used below; int() only checks that it is numeric.
            rank = int(rank)
            # Hyphenated keywords are treated as separate words before stemming.
            tokens = nltk.word_tokenize(phrase.replace('-', ' '))
            stemmed = ' '.join(stemmer.stem(token) for token in tokens).lower()
            keywords[lecture_name].add(stemmed)

In [3]:
# Point NLTK at the local Stanford POS tagger distribution (jar and models live
# in the same directory). The tagger shells out to Java, so a Java runtime must
# be installed for this cell to succeed.
os.environ['CLASSPATH'] = '../../../scripts/stanford-postagger-2015-12-09'
os.environ['STANFORD_MODELS'] = '../../../scripts/stanford-postagger-2015-12-09'
tagger = nltk.tag.StanfordPOSTagger('models/english-bidirectional-distsim.tagger')

# One progress tick per document.
prog = IntProgress(min=0, max=len(documents))
display(prog)

# POS-tagged sentences per document, in the same order as `documents`.
tagged_documents = OrderedDict()

for name, doc in documents.items():
    # Split into sentences first, then into word tokens, so the tagger sees
    # one token list per sentence.
    sentence_tokens = [nltk.word_tokenize(sentence) for sentence in nltk.sent_tokenize(doc)]
    tagged_documents[name] = tagger.tag_sents(sentence_tokens)
    prog.value = prog.value + 1

No Java runtime present, requesting install.



OSError: Java command failed : [u'/usr/bin/java', '-mx1000m', '-cp', '../../../scripts/stanford-postagger-2015-12-09/stanford-postagger.jar', 'edu.stanford.nlp.tagger.maxent.MaxentTagger', '-model', '../../../scripts/stanford-postagger-2015-12-09/models/english-bidirectional-distsim.tagger', '-textFile', '/var/folders/mc/wrw4y4fn2_n3rc77t7gzn9bm0000gn/T/tmpfH6tD3', '-tokenize', 'false', '-outputFormatOptions', 'keepEmptySentences', '-encoding', 'utf8']

In [None]:
def extract_tags(tree):
    """Collect the normalised text of every 'NP' chunk in a chunk tree.

    Args:
        tree: An ``nltk.tree.Tree`` (or subclass) node, or a leaf. Anything
            that is not a Tree is treated as a leaf and yields no phrases.

    Returns:
        A list of phrases, one per 'NP' subtree. Each phrase is lower-cased,
        has the characters ``( ) . , '`` removed and hyphens turned into
        spaces, with whitespace collapsed to single spaces. Phrases that are
        empty after this clean-up are skipped.
    """
    results = []

    # isinstance (rather than an exact type comparison) also accepts Tree
    # subclasses.
    if isinstance(tree, nltk.tree.Tree):
        if tree.label() == 'NP':
            # Assumes the children of an NP node are (word, tag) leaves, which
            # holds for a flat, non-nested chunk grammar.
            phrase = ' '.join([word for word, _pos in tree])
            phrase = phrase.lower()
            phrase = re.sub("[().,']", '', phrase)
            phrase = re.sub('-', ' ', phrase)
            # Removing punctuation/hyphens can leave runs of spaces or nothing
            # at all; normalise so phrases stay comparable as plain strings.
            phrase = ' '.join(phrase.split())
            if phrase:
                results.append(phrase)

        for child in tree:
            results.extend(extract_tags(child))

    return results

# Chunk grammar: a noun phrase is any number of adjectives (JJ, JJS, JJR)
# followed by one or more nouns (NN, NNS, NNP, NNPS).
grammar = '''
NP: {(<JJ>|<JJS>|<JJR>)*(<NN>|<NNS>|<NNP>|<NNPS>)+}
'''

cp = nltk.RegexpParser(grammar)

# chunked_docs: one list of noun phrases per document, in document order.
chunked_docs = []
# Distinct noun phrases seen across all documents.
unique_candidates = set()

for doc in tagged_documents.values():
    chunked_doc = []
    for sent in doc:
        # Parse and extract once per sentence and reuse the result for both
        # the per-document list and the global candidate set.
        phrases = extract_tags(cp.parse(sent))
        chunked_doc.extend(phrases)
        unique_candidates.update(phrases)
    chunked_docs.append(chunked_doc)

# Sorted so the candidate list (and any vocabulary indices derived from it)
# is identical from run to run; plain set iteration order is arbitrary.
candidates = sorted(unique_candidates)

In [None]:
# Load the stop-word list and stem each entry so it is in the same form as the
# stemmed documents built below.
# NOTE(review): absolute, user-specific path; this cell only runs on the
# original author's machine. Consider moving the file next to the other data.
stopwords = []
with open('/Users/andrewlamb/Google_Drive/Stanford/CS199/RAKE-tutorial/SmartStoplistAdditional.txt') as f:
    for line in f:
        stopword = line.strip()
        if not stopword:
            # Blank lines would otherwise add an empty-string stop word.
            continue
        stemmed = ' '.join([stemmer.stem(token) for token in nltk.word_tokenize(stopword)])
        stopwords.append(stemmed)

# Stemmed, lower-cased version of every document, keyed by file name.
stemmed_documents = OrderedDict()

for name, document in documents.items():
    # Split hyphenated words before tokenizing. The de-hyphenated text (not
    # the original document) must be the one that is tokenized, otherwise the
    # replacement has no effect.
    dehyphenated = document.replace('-', ' ')
    stemmed_document = ' '.join([stemmer.stem(token) for token in nltk.word_tokenize(dehyphenated)])
    stemmed_documents[name] = stemmed_document.lower()

# Score only the candidate phrases (fixed vocabulary) with TF-IDF.
# NOTE(review): TfidfVectorizer defaults to ngram_range=(1, 1), so multi-word
# candidates will presumably never be counted; confirm and widen if intended.
# NOTE(review): documents are stemmed here while `candidates` do not appear to
# be stemmed; verify the two are meant to be comparable.
extractor = TfidfVectorizer(vocabulary=candidates, stop_words=stopwords)
X = extractor.fit_transform(list(stemmed_documents.values()))
inverse_voc = {v: k for k, v in extractor.vocabulary_.items()}

# For every document, the vocabulary terms ordered by TF-IDF score. np.argsort
# sorts ascending, so the highest-scoring terms are at the END of each list.
extracted = {}
for doc_idx, name in enumerate(documents.keys()):
    extracted[name] = [inverse_voc[idx] for idx in np.argsort(X[doc_idx,:].toarray().flatten())]

In [None]:
# Compare each lecture's gold keywords with the same number of top-scoring
# extracted terms (the tail of the `extracted` list).
matches = OrderedDict()
jaccard = OrderedDict()

for name, keys in sorted(keywords.items()):
    label = name.replace('.keys', '')
    gold = set(keys)
    top_terms = extracted[name][-len(keys):]
    shared = gold.intersection(top_terms)
    matches[label] = list(shared)
    # Jaccard index: |gold & extracted| / |gold | extracted|
    jaccard[label] = float(len(shared)) / len(gold.union(set(top_terms)))

print('Average jaccard: {}'.format(sum(jaccard.values()) / len(jaccard)))

In [None]:
# Same evaluation, but allowing twice as many extracted terms as there are
# gold keywords for the lecture.
matches = OrderedDict()
jaccard = OrderedDict()

for name, keys in sorted(keywords.items()):
    lecture = name.replace('.keys', '')
    cutoff = 2 * len(keys)
    predicted = extracted[name][-cutoff:]
    reference = set(keys)
    overlap = reference.intersection(predicted)
    combined = reference.union(set(predicted))
    matches[lecture] = list(overlap)
    jaccard[lecture] = float(len(overlap)) / len(combined)

print('Average jaccard: {}'.format(sum(jaccard.values()) / len(jaccard)))

In [None]:
# Write the ten best terms per document as "document,key,rank" rows. The first
# five rows get ranks 1-5; the remaining five are written with rank 0.
with open('tfidf_chunk_keys.csv', 'w') as out:
    for document, ranked in sorted(extracted.items()):
        # The list is reversed before taking the first ten entries.
        top_ten = ranked[::-1][:10]
        for position, key in enumerate(top_ten, 1):
            rank = position if position <= 5 else 0
            out.write('{},{},{}\n'.format(document, key, rank))