In [1]:
import os
import nltk

import numpy as np

from collections import OrderedDict, defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [28]:
documents = OrderedDict()

for name in os.listdir('/Users/andrewlamb/Google_Drive/Stanford/CS199/DB_SelfPaced/Flat'):
    with open('/Users/andrewlamb/Google_Drive/Stanford/CS199/DB_SelfPaced/Flat/' + name) as f:
        lines = []
        for line in f:
            lines.append(line.strip('| \n'))
        documents[name] = (' '.join(lines))
        
keywords = defaultdict(set)
stemmer = nltk.stem.PorterStemmer()

for name in os.listdir('/Users/andrewlamb/Google_Drive/Stanford/CS199/studyguides/data/keywords'):
    with open('/Users/andrewlamb/Google_Drive/Stanford/CS199/studyguides/data/keywords/' + name) as f:
        for line in f:
            lecture_name, key, _, rank = line.strip().split(',')
            rank = int(rank)
            if rank > -1:
                key = key.replace('-', ' ')
                key = ' '.join(stemmer.stem(token) for token in nltk.word_tokenize(key))
                key = key.lower()
                keywords[lecture_name].add(key)

In [20]:
stopwords = []
with open('/Users/andrewlamb/Google_Drive/Stanford/CS199/RAKE-tutorial/SmartStoplistAdditional.txt') as f:
    for line in f:
        stopwords.append(line.strip())
    
stemmed_documents = OrderedDict()

for name, document in documents.items():
    document = document.replace('-', ' ')
    document = ' '.join([stemmer.stem(token) for token in nltk.word_tokenize(document)])
    stemmed_documents[name] = document.lower()

extractor = TfidfVectorizer(stop_words=stopwords, ngram_range=(1,4), norm=None, smooth_idf=False)
X = extractor.fit_transform(stemmed_documents.values())
inverse_voc = {v: k for k, v in extractor.vocabulary_.items()}

extracted = OrderedDict()
for doc_idx, name in enumerate(stemmed_documents.keys()):
    extracted[name] = [inverse_voc[idx] for idx in np.argsort(X[doc_idx,:].toarray().flatten())]

In [52]:
extractor.vocabulary_

{u'creat thi claus': 34304,
 u'ha gpa greater 95': 64457,
 u'applic concern': 9878,
 u'onli view databas': 92447,
 u'shape ve join': 118841,
 u'approach thi': 10457,
 u'match isbn attribut': 84131,
 u'back ran queri queri': 14672,
 u'sort static': 122740,
 u'system end user end': 131865,
 u'command onc send': 27987,
 u'element entir': 48662,
 u'state california start show': 126221,
 u'book id ref refer': 18787,
 u'cover implement index': 33840,
 u'enabl user delet': 49520,
 u'applic ve finish acid': 10393,
 u'similar nanci befor tri': 120025,
 u'bob run': 18410,
 u'gpa actual write': 62313,
 u'thi time': 138950,
 u'element similarli befor': 49011,
 u'littl inform book magazin': 78528,
 u'tricki matter trigger activ': 142789,
 u'2011 wa neo4j flat': 619,
 u'data websit suggest download': 37391,
 u'follow subel follow text': 58706,
 u'alway default behavior': 6095,
 u'queri databas ll result': 102042,
 u'json start talk': 74569,
 u'enrol greater maximum': 49944,
 u'veri expens mid west':

In [29]:
matches = OrderedDict()
jaccard = OrderedDict()

for name, keys in sorted(keywords.items()):
    extracted_words = extracted[name][-len(keys):]
    matches[name.replace('.keys', '')] = list(set(keys).intersection(extracted_words))
    jaccard[name.replace('.keys', '')] = float(len(set(keys).intersection(extracted_words))) / len(set(keys).union(set(extracted_words)))
    
print('Average jaccard: {}'.format(sum(jaccard.values()) / len(jaccard)))

Average jaccard: 0.124246903191


In [22]:
matches = OrderedDict()
jaccard = OrderedDict()

for name, keys in sorted(keywords.items()):
    extracted_words = extracted[name][-2*len(keys):]
    matches[name.replace('.keys', '')] = list(set(keys).intersection(extracted_words))
    jaccard[name.replace('.keys', '')] = float(len(set(keys).intersection(extracted_words))) / len(set(keys).union(set(extracted_words)))
    
print('Average jaccard: {}'.format(sum(jaccard.values()) / len(jaccard)))

Average jaccard: 0.107531278382


In [51]:
sorted(matches.items()[6])

[[u'xml schema', u'dtd', u'xsd', u'key', u'valid'], '03_02_XML_Schema.txt']

In [47]:
stemmed_documents.values()[5].count('xml schema')

5

In [50]:
list(reversed(extracted['03_02_DTDs_IDs_and_IDREFs.txt'])).index('xml schema')

50

In [None]:
with open('tfidf_keys.csv', 'w') as f:
    for document, keys in sorted(extracted.items()):
        keys = list(reversed(keys))
        for i, key in enumerate(keys[:10]):
            if i < 5:
                f.write('{},{},{}\n'.format(document, key, i + 1))
            else:
                f.write('{},{},{}\n'.format(document, key, 0))